"""Unit conversion utility for the unit_conversion transform.
This module computes a **scalar scaling factor** ``f`` such that
value_out = f * value_in
given a source unit string and a target unit string.
Supported unit families
========================
**Dimensional units via pint:**
Any unit recognised by `pint <https://pint.readthedocs.io>`__ —
kg, g, mol, s, m, Pa, etc., including compound units such as
``kg m-2 s-1`` or ``nmol mol-1``.
**Atmospheric mixing-ratio shorthands:**
``ppm``, ``ppb``, ``ppt`` (and ``v``-suffixed variants) are
dimensionless mole-fraction ratios registered as custom pint units:
- 1 ppm = 1e-6 mol mol⁻¹
- 1 ppb = 1e-9 mol mol⁻¹
- 1 ppt = 1e-12 mol mol⁻¹
**Species-qualified mass units (geochemistry conventions):**
``kgC``, ``TgCO2``, ``GtN``, ``kgCH4``, etc. encode the molecular
or atomic weight of a specific species. pint does not know these;
they are pre-processed here by splitting the SI prefix+base unit from
the species qualifier (e.g. ``kgC`` → base ``kg``, species ``C``
with M = 12.011 g mol⁻¹). When source and target qualifiers differ,
the inter-species factor ``M_out / M_in`` is applied automatically
(e.g. CO2 → C gives 12.011 / 44.009).
**Time unit aliases:**
``y`` and ``yr`` are mapped to ``year`` before pint sees the string,
because ``y`` is the SI yocto prefix (1×10⁻²⁴) in standard pint.
Usage
=====
::
from .convert import get_conversion_factor
get_conversion_factor("kg", "g") # 1000.0
get_conversion_factor("ppb", "mol mol-1") # 1e-9
get_conversion_factor("kgC m-2 yr-1",
"gC m-2 s-1") # 1000 / 3.156e7
get_conversion_factor("kgCO2 m-2 s-1",
"kgC m-2 s-1") # 12.011 / 44.009
get_conversion_factor("kg kg-1", "ppb",
mol_weight=16.043) # 28.97/16.043 × 1e9
Dependencies
============
Requires ``pint`` (``pip install pint``).
"""
from __future__ import annotations
import re
from typing import Optional, Tuple
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
#: Mean molecular weight of dry air, in g mol⁻¹.
_M_AIR: float = 28.97

#: Molar mass (g mol⁻¹) of every species qualifier the module understands.
#: A key is compared case-sensitively against the END of a unit token.
SPECIES_MOLAR_MASS: dict[str, float] = {
    # carbon family
    "C": 12.011, "CO2": 44.009, "CH4": 16.043, "CO": 28.01,
    # nitrogen family
    "N": 14.007, "N2O": 44.013, "NO2": 46.006, "NO": 30.006, "NH3": 17.031,
    "NOx": 46.006,  # by convention counted as NO2
    # sulphur family
    "S": 32.06, "SO2": 64.06,
    # everything else
    "H2O": 18.015, "O3": 48.0, "HCHO": 30.026,
}

#: Short time tokens rewritten to the long unit names before pint parses the
#: string: ``y`` on its own is the SI yocto prefix (1e-24) and ``mon`` is not
#: a standard unit name.
_TIME_ALIASES: dict[str, str] = {"y": "year", "yr": "year", "mon": "month"}
# ---------------------------------------------------------------------------
# pint registry (built once, lazily)
# ---------------------------------------------------------------------------
_ureg: Optional[object] = None


def _registry():
    """Return the module-wide pint ``UnitRegistry``, creating it on first use.

    The registry is cached in the module global ``_ureg``.  On creation the
    mixing-ratio shorthands ``ppm``/``ppb``/``ppt`` (with ``ppmv``/``ppbv``/
    ``pptv`` as their symbols) are defined as 1e-6, 1e-9 and 1e-12 mol/mol.

    Raises:
        ImportError: if the ``pint`` package cannot be imported.
    """
    global _ureg
    if _ureg is None:
        try:
            import pint
        except ImportError as exc:
            raise ImportError(
                "The 'pint' package is required for automatic unit conversion. "
                "Install it with: pip install pint"
            ) from exc

        registry = pint.UnitRegistry()
        # Mixing-ratio shorthands, expressed as mole fractions.
        for definition in (
            "ppm = 1e-6 mol / mol = ppmv",
            "ppb = 1e-9 mol / mol = ppbv",
            "ppt = 1e-12 mol / mol = pptv",
        ):
            registry.define(definition)
        # Publish only once the registry is fully configured.
        _ureg = registry
    return _ureg
# ---------------------------------------------------------------------------
# Unit string pre-processing
# ---------------------------------------------------------------------------
def _apply_time_aliases(unit_str: str) -> str:
    """Expand the short time tokens listed in ``_TIME_ALIASES``.

    An alias is rewritten only when it is not directly adjacent to another
    ASCII letter, so it is never picked out of the middle of a longer name
    (``yr`` inside ``kyr``, ``y`` inside ``day``).  Digits and signs may
    touch it, which lets an exponent follow: ``y-1`` becomes ``year-1``.

    Args:
        unit_str: raw unit string, e.g. ``"kgC m-2 y-1"``.

    Returns:
        The unit string with every stand-alone alias replaced.
    """
    converted = unit_str
    for short_name, long_name in _TIME_ALIASES.items():
        # Letter-only lookarounds: the alias must not be glued to a letter.
        whole_token = re.compile(
            r'(?<![a-zA-Z])' + re.escape(short_name) + r'(?![a-zA-Z])'
        )
        converted = whole_token.sub(long_name, converted)
    return converted
#: Bases that cannot carry a species qualifier.  When nothing but an SI
#: prefix is left in front of the matched letters, those letters are the unit
#: symbol itself (``kN`` kilonewton, ``mC`` millicoulomb, ``mS``
#: millisiemens); ``deg`` + ``C`` is degrees Celsius.
_NON_SPECIES_BASES = frozenset({
    "Q", "R", "Y", "Z", "E", "P", "T", "G", "M", "k", "h", "da",
    "d", "c", "m", "u", "\u00b5", "\u03bc", "n", "p", "f", "a", "z", "y",
    "r", "q",
    "deg",
})


def _strip_species_qualifier(token: str) -> Tuple[str, Optional[str], Optional[float]]:
    """Split a unit token into (base_unit, species_qualifier, molar_mass).

    The species qualifier, if present, is appended directly to the
    prefix+unit with no separator (e.g. ``kgCO2``, ``TgC``, ``nmolCH4``).
    Qualifiers are tried longest-first so that a longer key always wins
    over a shorter key that is also a suffix of the token.

    A match is discarded when the remaining base is empty (bare ``C``) or is
    one of ``_NON_SPECIES_BASES``: in those cases the token is an ordinary
    unit whose symbol merely ends in the same letters as a species.

    Args:
        token: a single unit token, e.g. ``"kgCO2"``, ``"Tg"``, ``"ppb"``.

    Returns:
        Tuple of:

        * ``base_unit`` (str) — the token with the qualifier removed, or the
          unchanged token when no qualifier applies.
        * ``qualifier`` (str or None) — the matched species string.
        * ``molar_mass`` (float or None) — the species M in g mol⁻¹.

    Examples::

        _strip_species_qualifier("kgCO2") → ("kg", "CO2", 44.009)
        _strip_species_qualifier("TgC")   → ("Tg", "C", 12.011)
        _strip_species_qualifier("kg")    → ("kg", None, None)
        _strip_species_qualifier("ppb")   → ("ppb", None, None)
        _strip_species_qualifier("degC")  → ("degC", None, None)
        _strip_species_qualifier("kN")    → ("kN", None, None)
    """
    for qualifier in sorted(SPECIES_MOLAR_MASS, key=len, reverse=True):
        if not token.endswith(qualifier):
            continue
        base = token[: -len(qualifier)]
        # Something that can actually be a mass/amount unit must remain.
        if base and base not in _NON_SPECIES_BASES:
            return base, qualifier, SPECIES_MOLAR_MASS[qualifier]
    return token, None, None
#: A unit name (optionally wrapped in parentheses) immediately followed by a
#: signed integer — the UDUNITS way of writing a power (``m-2``, ``s-1``).
_IMPLICIT_EXPONENT = re.compile(r'(\(*[^\W\d_]+)([+\-]?\d+)(\)*)')


def _parse_unit_string(unit_str: str) -> Tuple[str, Optional[str], Optional[float]]:
    """Pre-process a full unit string and extract any species qualifier.

    Steps, in order:

    1. Strip surrounding whitespace and resolve time aliases.
    2. Split on runs of whitespace/slashes, keeping the separators so the
       string can be rebuilt position by position.
    3. For every token, remove a species qualifier (``kgC`` → ``kg``).
       At most **one** distinct qualifier is allowed per unit string.
    4. Rewrite UDUNITS-style implicit exponents as explicit powers
       (``m-2`` → ``m**-2``, ``m2`` → ``m**2``).  pint's default parser does
       not treat a trailing integer as a power (``m-2`` is read as
       ``m - 2``), whereas the ``**`` form is always accepted.  Tokens that
       start with a number (``1e-6``) are left alone.

    Args:
        unit_str: raw unit string such as ``"kgC m-2 y-1"`` or
            ``"TgCO2 yr-1"``.

    Returns:
        Tuple of:

        * ``clean_unit`` (str) — the unit string with qualifier stripped,
          time aliases resolved and exponents made explicit, ready for pint.
        * ``qualifier`` (str or None) — the species qualifier found.
        * ``molar_mass`` (float or None) — its molar mass in g mol⁻¹.

    Raises:
        ValueError: if more than one distinct species qualifier is found in
            the same unit string (ambiguous).
    """
    unit_str = _apply_time_aliases(unit_str.strip())

    # The capturing group keeps the separators: tokens sit at even indices,
    # separators at odd ones.  Rewriting tokens in place avoids the textual
    # search-and-replace that could hit a look-alike substring of another
    # token.
    pieces = re.split(r'([\s/]+)', unit_str)

    qualifier: Optional[str] = None
    molar_mass: Optional[float] = None

    for index in range(0, len(pieces), 2):
        tok = pieces[index]

        # Separate any leading sign / numeric factor from the unit name.
        match = re.fullmatch(r'([+\-]?\d*\.?\d*)([a-zA-Z].*)', tok)
        if match:
            prefix_num, unit_part = match.group(1), match.group(2)
        else:
            prefix_num, unit_part = "", tok

        base, q, M = _strip_species_qualifier(unit_part)
        if q is not None:
            if qualifier is not None and q != qualifier:
                raise ValueError(
                    f"Ambiguous unit string '{unit_str}': found two different "
                    f"species qualifiers '{qualifier}' and '{q}'. "
                    "Only one species qualifier per unit string is supported."
                )
            qualifier = q
            molar_mass = M

        # Only plain "<name><int>" tokens are rewritten; a numeric prefix
        # means the digits may belong to a literal such as 1e-6.
        # NOTE(review): a unit whose own name ends in a digit would be
        # misread here; none of the families this module documents do.
        if not prefix_num:
            power = _IMPLICIT_EXPONENT.fullmatch(base)
            if power:
                name, exponent, closing = power.groups()
                base = f"{name}**{exponent.lstrip('+')}{closing}"

        pieces[index] = prefix_num + base

    return "".join(pieces), qualifier, molar_mass
# ---------------------------------------------------------------------------
# Dimensionality helpers
# ---------------------------------------------------------------------------
def _dimensionality(unit_str: str) -> dict:
    """Look up the pint dimensionality of an already pre-processed unit string.

    Builds a unit-magnitude quantity in the shared registry and returns its
    ``dimensionality`` attribute.
    """
    registry = _registry()
    unit_quantity = registry.Quantity(1.0, unit_str)
    return unit_quantity.dimensionality
def _is_mass_fraction(unit_str: str) -> bool:
    """Return True if *unit_str* is a mass mixing ratio (kg kg⁻¹ family).

    pint reduces ``kg / kg`` to plain *dimensionless*, exactly like
    ``mol / mol`` or ``ppb``, so comparing dimensionalities alone cannot
    identify the family.  The string therefore has to be dimensionless
    overall **and** be written exclusively with mass units (``kg / kg``,
    ``g / kg``, …).

    Any error raised while inspecting the string (unknown unit, unparsable
    expression, pint unavailable) yields False.
    """
    try:
        if _dimensionality(unit_str) != _dimensionality("kg / kg"):
            return False
        mass_dimension = _dimensionality("kg")
        # Every alphabetic run in the string is taken to be a unit name.
        unit_names = re.findall(r'[^\W\d_]+', unit_str)
        return bool(unit_names) and all(
            _dimensionality(name) == mass_dimension for name in unit_names
        )
    except Exception:
        return False
#: Mixing-ratio shorthands that this module registers in pint as mole
#: fractions (names and their ``v``-suffixed symbols).
_MOLE_FRACTION_SHORTHANDS = frozenset(
    {"ppm", "ppb", "ppt", "ppmv", "ppbv", "pptv"}
)


def _is_mole_fraction(unit_str: str) -> bool:
    """Return True if *unit_str* is a mole fraction (mol mol⁻¹, ppm, …).

    pint reduces ``mol / mol`` to plain *dimensionless*, exactly like
    ``kg / kg``, so comparing dimensionalities alone cannot identify the
    family.  The string therefore has to be dimensionless overall **and**
    consist only of amount-of-substance units and/or the mixing-ratio
    shorthands ``ppm``/``ppb``/``ppt`` (and their ``v`` variants).

    Any error raised while inspecting the string (unknown unit, unparsable
    expression, pint unavailable) yields False.
    """
    try:
        if _dimensionality(unit_str) != _dimensionality("mol / mol"):
            return False
        amount_dimension = _dimensionality("mol")
        # Every alphabetic run in the string is taken to be a unit name.
        unit_names = re.findall(r'[^\W\d_]+', unit_str)
        return bool(unit_names) and all(
            name in _MOLE_FRACTION_SHORTHANDS
            or _dimensionality(name) == amount_dimension
            for name in unit_names
        )
    except Exception:
        return False
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def _convert_via_molar_mass(ureg, clean_in, clean_out, M_in, M_out) -> Optional[float]:
    """Bridge mass ↔ amount of substance with a molar mass (g mol⁻¹).

    Tries ``source / M_in`` (mass → amount) and then ``source * M_out``
    (amount → mass).  Returns the factor of the first attempt that pint can
    convert to ``clean_out``, or None when neither applies.
    """
    for molar_mass, to_amount in ((M_in, True), (M_out, False)):
        if molar_mass is None:
            continue
        try:
            source = ureg.Quantity(1.0, clean_in)
            molar = ureg.Quantity(molar_mass, "g / mol")
            bridged = source / molar if to_amount else source * molar
            return float(bridged.to(clean_out).magnitude)
        except Exception:
            # Wrong dimensions for this direction (or unparsable unit).
            continue
    return None


def get_conversion_factor(
    unit_in: str,
    unit_out: str,
    mol_weight: Optional[float] = None,
) -> float:
    """Compute the scalar factor ``f`` such that ``value_out = f * value_in``.

    Conversion paths, tried in this order:

    1. **Identical units** — returns 1.0 immediately.
    2. **Mass fraction ↔ mole fraction** — the cross-family factor
       ``M_air / M_species`` (or its inverse) is applied together with any
       scale difference within each family.  The molar mass is taken from
       the qualifier on the mass-fraction side, else from the qualifier on
       the mole-fraction side, else from ``mol_weight``.
    3. **Dimensional conversion** — delegated to pint.  When source and
       target carry different species qualifiers the result is multiplied
       by ``M_out / M_in``.  Example: ``kgCO2 m-2 s-1`` → ``kgC m-2 s-1``
       gives ``12.011 / 44.009``.
    4. **Mass ↔ amount of substance** — if pint cannot convert directly, the
       source is divided by ``M_in`` (mass → mol) or multiplied by ``M_out``
       (mol → mass) and the conversion is retried.

    NOTE: inter-species factors are plain molar-mass ratios, i.e. they assume
    one mole of target species per mole of source species and, in path 3,
    that the qualified unit is a mass unit.

    Args:
        unit_in (str): Source unit string.  Supports pint syntax plus
            mixing-ratio shorthands (``ppm``, ``ppb``, ``ppt`` / ``v``
            variants), species qualifiers (``kgC``, ``TgCO2``, …), and
            time aliases (``y`` / ``yr`` → year).
        unit_out (str): Target unit string (same conventions).
        mol_weight (float, optional): Species molar mass in g mol⁻¹, used
            for a side that carries no species qualifier.  When a qualifier
            is present its molar mass is used instead.  Must be positive.

    Returns:
        float: Multiplicative scaling factor.

    Raises:
        ImportError: if ``pint`` is not installed.
        ValueError: if the conversion is dimensionally impossible, ambiguous,
            requires a molar mass that was not provided, or ``mol_weight``
            is not positive.

    Examples::

        get_conversion_factor("kg", "g")
        # → 1000.0
        get_conversion_factor("ppb", "mol mol-1")
        # → 1e-9
        get_conversion_factor("kgC m-2 yr-1", "gC m-2 s-1")
        # → 1000 / 3.156e7   (mass unchanged, time+prefix only)
        get_conversion_factor("kgCO2 m-2 s-1", "kgC m-2 s-1")
        # → 12.011 / 44.009  (inter-species, base units cancel)
        get_conversion_factor("TgCO2 yr-1", "molCO2 s-1")
        # → 1e9 / (44.009e-3) / 3.156e7
        get_conversion_factor("kg kg-1", "ppb", mol_weight=16.043)
        # → (28.97 / 16.043) * 1e9
    """
    if unit_in.strip() == unit_out.strip():
        return 1.0

    if mol_weight is not None and mol_weight <= 0:
        raise ValueError(
            f"mol_weight must be a positive molar mass in g/mol, got {mol_weight!r}."
        )

    # Resolve the registry up front so a missing pint surfaces as the
    # documented ImportError instead of being masked further down.
    ureg = _registry()

    # M_in / M_out are qualifier-derived here (None without a qualifier).
    clean_in, q_in, M_in = _parse_unit_string(unit_in)
    clean_out, q_out, M_out = _parse_unit_string(unit_out)

    # -----------------------------------------------------------------------
    # Mass fraction ↔ mole fraction (cross-family)
    # -----------------------------------------------------------------------
    in_mass = _is_mass_fraction(clean_in)
    out_mass = _is_mass_fraction(clean_out)
    in_mole = _is_mole_fraction(clean_in)
    out_mole = _is_mole_fraction(clean_out)

    if (in_mass and out_mole) or (in_mole and out_mass):
        mass_to_mole_direction = in_mass and out_mole
        # The species on the mass-fraction side fixes the molar mass; a
        # qualifier always outranks the mol_weight argument.
        if mass_to_mole_direction:
            mass_side_M, mole_side_M = M_in, M_out
        else:
            mass_side_M, mole_side_M = M_out, M_in
        eff_M = next(
            (m for m in (mass_side_M, mole_side_M, mol_weight) if m is not None),
            None,
        )
        if eff_M is None:
            raise ValueError(
                f"Converting '{unit_in}' to '{unit_out}' crosses the "
                "mass-fraction / mole-fraction boundary and requires a "
                "species molar mass. Pass mol_weight=<M in g/mol> or use "
                "a species-qualified unit such as 'kgCH4' or 'molCH4'."
            )

        mass_to_mole = _M_AIR / eff_M
        cross_factor = mass_to_mole if mass_to_mole_direction else 1.0 / mass_to_mole

        # Reference units within each family for the pint residual.
        ref_in = "kg / kg" if mass_to_mole_direction else "mol / mol"
        ref_out = "mol / mol" if mass_to_mole_direction else "kg / kg"
        scale_in = float(ureg.Quantity(1.0, clean_in).to(ref_in).magnitude)
        scale_out = float(ureg.Quantity(1.0, ref_out).to(clean_out).magnitude)
        # No extra inter-species factor: eff_M already accounts for it.
        return float(scale_in * cross_factor * scale_out)

    # -----------------------------------------------------------------------
    # Inter-species factor (different qualifiers, e.g. CO2 → C)
    # -----------------------------------------------------------------------
    inter_species_factor = 1.0
    if q_in is not None and q_out is not None and q_in != q_out:
        # Same number of moles, expressed as mass of the target species.
        inter_species_factor = M_out / M_in

    # -----------------------------------------------------------------------
    # Dimensional conversion via pint, with a molar-mass bridge as fallback
    # -----------------------------------------------------------------------
    try:
        pint_factor = float(ureg.Quantity(1.0, clean_in).to(clean_out).magnitude)
    except Exception as exc:
        bridged = _convert_via_molar_mass(
            ureg,
            clean_in,
            clean_out,
            M_in if M_in is not None else mol_weight,
            M_out if M_out is not None else mol_weight,
        )
        if bridged is not None:
            return bridged
        raise ValueError(
            f"Cannot convert '{unit_in}' to '{unit_out}': {exc}\n"
            "If a species molar mass is needed, pass mol_weight=<M in g/mol> "
            "or use qualified units such as 'kgCO2' / 'kgC'."
        ) from exc
    return float(inter_species_factor * pint_factor)