# Source code for pycif.plugins.transforms.basic.unit_conversion.convert

"""Unit conversion utility for the unit_conversion transform.

This module computes a **scalar scaling factor** ``f`` such that

    value_out = f * value_in

given a source unit string and a target unit string.

Supported unit families
========================

**Dimensional units via pint:**
    Any unit recognised by `pint <https://pint.readthedocs.io>`__ —
    kg, g, mol, s, m, Pa, etc., including compound units such as
    ``kg m-2 s-1`` or ``nmol mol-1``.

**Atmospheric mixing-ratio shorthands:**
    ``ppm``, ``ppb``, ``ppt`` (and ``v``-suffixed variants) are
    dimensionless mole-fraction ratios registered as custom pint units:

    - 1 ppm  = 1e-6  mol mol⁻¹
    - 1 ppb  = 1e-9  mol mol⁻¹
    - 1 ppt  = 1e-12 mol mol⁻¹

**Species-qualified mass units (geochemistry conventions):**
    ``kgC``, ``TgCO2``, ``GtN``, ``kgCH4``, etc. encode the molecular
    or atomic weight of a specific species.  pint does not know these;
    they are pre-processed here by splitting the SI prefix+base unit from
    the species qualifier (e.g. ``kgC`` → base ``kg``, species ``C``
    with M = 12.011 g mol⁻¹).  When source and target qualifiers differ,
    the inter-species mass factor ``M_out / M_in`` (e.g. ``12.011 / 44.009``
    for ``kgCO2`` → ``kgC``) is applied automatically.

**Time unit aliases:**
    ``y`` and ``yr`` are mapped to ``year`` before pint sees the string,
    because ``y`` is the SI yocto prefix (1×10⁻²⁴) in standard pint.

Usage
=====

::

    from .convert import get_conversion_factor

    get_conversion_factor("kg", "g")               # 1000.0
    get_conversion_factor("ppb", "mol mol-1")       # 1e-9
    get_conversion_factor("kgC m-2 yr-1",
                          "gC m-2 s-1")            # 1000 / 3.156e7
    get_conversion_factor("kgCO2 m-2 s-1",
                          "kgC m-2 s-1")           # 12.011 / 44.009
    get_conversion_factor("kg kg-1", "ppb",
                          mol_weight=16.043)        # 28.97/16.043 × 1e9

Dependencies
============

Requires ``pint`` (``pip install pint``).
"""

from __future__ import annotations

import re
from typing import Optional, Tuple

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

#: Dry-air mean molecular weight (g mol⁻¹).
#: Used by ``get_conversion_factor`` as the air term of the
#: mass-fraction ↔ mole-fraction factor.
_M_AIR: float = 28.97

#: Known species qualifiers and their molecular weights (g mol⁻¹).
#: Keys are matched case-sensitively at the END of a unit token;
#: ``_strip_species_qualifier`` tries the longest keys first, so ``CO2``
#: is tested before ``C``.
SPECIES_MOLAR_MASS: dict[str, float] = {
    # Carbon species
    "C":    12.011,
    "CO2":  44.009,
    "CH4":  16.043,
    "CO":   28.010,
    # Nitrogen species
    "N":    14.007,
    "N2O":  44.013,
    "NO2":  46.006,
    "NO":   30.006,
    "NH3":  17.031,
    "NOx":  46.006,   # treated as NO2 equivalent by convention
    # Sulphur species
    "S":    32.06,
    "SO2":  64.06,
    # Other
    "H2O":  18.015,
    "O3":   48.00,
    "HCHO": 30.026,
}

#: Time unit aliases not reliably handled by pint.
#: ``y`` is the SI yocto prefix (1e-24); ``mon`` is not standard.
#: ``_apply_time_aliases`` substitutes each key, as a whole token, with its
#: value before the unit string is handed to pint.
_TIME_ALIASES: dict[str, str] = {
    "y":   "year",
    "yr":  "year",
    "mon": "month",
}


# ---------------------------------------------------------------------------
# pint registry (built once, lazily)
# ---------------------------------------------------------------------------

_ureg: Optional[object] = None

#: Zero-width position between a unit name (or a closing parenthesis) and a
#: directly attached UDUNITS-style exponent, e.g. between ``m`` and ``-2`` in
#: ``m-2``.  The two negative look-behinds keep the exponent of a float
#: literal such as ``1e-6`` untouched.
_UDUNITS_EXPONENT = re.compile(
    r"(?<=[A-Za-z\)])(?![A-Za-z\)])(?<![0-9\-][eE])(?<![0-9\-])(?=[0-9\-])"
)


def _udunits_exponents_to_pint(unit_str: str) -> str:
    """Rewrite UDUNITS-style exponents (``m-2``, ``s2``) as ``m**-2``, ``s**2``.

    pint's expression parser turns whitespace into multiplication, so without
    this step ``"kg m-2"`` would be read as ``kg*m - 2`` instead of
    ``kg * m**-2``.  Strings that already use ``**`` or ``^`` are returned
    unchanged, which makes the rewrite idempotent.

    Args:
        unit_str: unit expression as written by the user.

    Returns:
        The expression with an explicit ``**`` before each attached exponent.
    """
    return _UDUNITS_EXPONENT.sub("**", unit_str)


def _registry():
    """Return the shared pint UnitRegistry, building it on first call.

    The registry is created with a preprocessor that accepts UDUNITS-style
    exponents (``kg m-2 s-1``) and with the mixing-ratio shorthands
    ``ppm`` / ``ppb`` / ``ppt`` (plus their ``v``-suffixed aliases) defined
    as scaled ``mol / mol``.

    Returns:
        The module-wide ``pint.UnitRegistry`` instance.

    Raises:
        ImportError: if ``pint`` is not installed.
    """
    global _ureg
    if _ureg is not None:
        return _ureg

    try:
        import pint
    except ImportError as exc:
        raise ImportError(
            "The 'pint' package is required for automatic unit conversion. "
            "Install it with:  pip install pint"
        ) from exc

    # The preprocessor runs on every unit string parsed by the registry, so
    # exponent notation such as "m-2" works for all callers of this module.
    ureg = pint.UnitRegistry(preprocessors=[_udunits_exponents_to_pint])

    # Mixing-ratio shorthands, defined explicitly so that the scale and the
    # "v"-suffixed aliases are the ones this module documents.
    ureg.define("ppm  = 1e-6  mol / mol = ppmv")
    ureg.define("ppb  = 1e-9  mol / mol = ppbv")
    ureg.define("ppt  = 1e-12 mol / mol = pptv")

    # Publish only a fully configured registry.
    _ureg = ureg
    return _ureg


# ---------------------------------------------------------------------------
# Unit string pre-processing
# ---------------------------------------------------------------------------

def _apply_time_aliases(unit_str: str) -> str:
    """Expand the time shorthands of ``_TIME_ALIASES`` into pint-safe names.

    Each alias is substituted only where it is not directly preceded or
    followed by an ASCII letter, so it is never rewritten inside a longer
    unit name; an attached exponent (``y-1``) does not prevent the match.
    Aliases are applied one after another, in dictionary order.

    Args:
        unit_str: raw unit string, e.g. ``"kgC m-2 y-1"``.

    Returns:
        The unit string with every stand-alone alias replaced.
    """
    result = unit_str
    for short, canonical in _TIME_ALIASES.items():
        # Letter-free neighbourhood on both sides == stand-alone token.
        standalone = re.compile(
            r'(?<![a-zA-Z])' + re.escape(short) + r'(?![a-zA-Z])'
        )
        result = standalone.sub(canonical, result)
    return result


def _strip_species_qualifier(token: str) -> Tuple[str, Optional[str], Optional[float]]:
    """Separate a trailing species qualifier from a single unit token.

    A qualifier is any key of ``SPECIES_MOLAR_MASS`` that the token ends
    with (case-sensitive), provided something is left in front of it.
    Candidates are tried from longest to shortest, so ``kgCO2`` yields
    ``CO2`` rather than a shorter key.  Note that *any* token ending in a
    key is treated as qualified; no check is made that the remainder is a
    mass or amount unit.

    Args:
        token: a single unit token, e.g. ``"kgCO2"``, ``"Tg"``, ``"ppb"``.

    Returns:
        ``(base_unit, qualifier, molar_mass)`` where ``qualifier`` and
        ``molar_mass`` (g mol⁻¹) are ``None`` when no qualifier matched and
        ``base_unit`` is then the unchanged token.

    Examples::

        _strip_species_qualifier("kgCO2")  → ("kg",  "CO2", 44.009)
        _strip_species_qualifier("TgC")    → ("Tg",  "C",   12.011)
        _strip_species_qualifier("kg")     → ("kg",  None,  None)
        _strip_species_qualifier("C")      → ("C",   None,  None)
    """
    longest_first = sorted(SPECIES_MOLAR_MASS, key=len, reverse=True)
    for species in longest_first:
        if not token.endswith(species):
            continue
        stem = token[: len(token) - len(species)]
        if not stem:
            # The token IS the species name (bare "C"): not a qualified unit.
            continue
        return stem, species, SPECIES_MOLAR_MASS[species]
    return token, None, None


def _parse_unit_string(unit_str: str) -> Tuple[str, Optional[str], Optional[float]]:
    """Pre-process a full unit string and extract any species qualifier.

    Applies time aliases, then scans each slash/space-delimited token for a
    species qualifier.  At most **one** distinct qualifier is expected (e.g.
    ``kgC m-2 s-1`` — only the ``kgC`` token has a qualifier); the same
    qualifier may appear on several tokens.

    The cleaned string is rebuilt token by token at the token's own
    position, with the original separators kept verbatim.  (Rebuilding with
    ``str.replace`` would hit the first *substring* match, which can lie
    inside an earlier token, e.g. ``gC`` inside ``kgC-1``.)

    Args:
        unit_str: raw unit string such as ``"kgC m-2 y-1"`` or
            ``"TgCO2 yr-1"``.

    Returns:
        Tuple of:

        * ``clean_unit`` (str) — the unit string with qualifier stripped
          and time aliases resolved, ready for pint.
        * ``qualifier``  (str or None) — the species qualifier found.
        * ``molar_mass`` (float or None) — its molar mass in g mol⁻¹.

    Raises:
        ValueError: if more than one distinct species qualifier is found in
            the same unit string (ambiguous).
    """
    unit_str = _apply_time_aliases(unit_str.strip())

    # The capturing group makes re.split keep the separators: even indices
    # hold unit tokens, odd indices hold the whitespace/slash runs.
    pieces = re.split(r'([\s/]+)', unit_str)

    qualifier: Optional[str] = None
    molar_mass: Optional[float] = None

    for index in range(0, len(pieces), 2):
        tok = pieces[index]

        # Separate any leading sign / numeric part from the unit name
        match = re.fullmatch(r'([+\-]?\d*\.?\d*)([a-zA-Z].*)', tok)
        if match:
            prefix_num, unit_part = match.group(1), match.group(2)
        else:
            prefix_num, unit_part = "", tok

        base, q, M = _strip_species_qualifier(unit_part)

        if q is not None:
            if qualifier is not None and q != qualifier:
                raise ValueError(
                    f"Ambiguous unit string '{unit_str}': found two different "
                    f"species qualifiers '{qualifier}' and '{q}'.  "
                    "Only one species qualifier per unit string is supported."
                )
            qualifier = q
            molar_mass = M

        # Put the cleaned token back exactly where the original one was.
        pieces[index] = prefix_num + base

    return "".join(pieces), qualifier, molar_mass


# ---------------------------------------------------------------------------
# Dimensionality helpers
# ---------------------------------------------------------------------------

def _dimensionality(unit_str: str) -> dict:
    """Look up the pint dimensionality of a pre-processed unit string.

    Args:
        unit_str: unit expression that pint can parse.

    Returns:
        The ``dimensionality`` attribute of a unit-magnitude quantity built
        from *unit_str* on the shared registry.
    """
    ureg = _registry()
    unit_quantity = ureg.Quantity(1.0, unit_str)
    return unit_quantity.dimensionality


def _is_mass_fraction(unit_str: str) -> bool:
    """Return True if *unit_str* is a mass mixing ratio (kg kg⁻¹ family).

    pint reduces ``kg / kg`` to *dimensionless*, exactly like ``mol / mol``
    or ``ppb``, so a dimensionality comparison alone is True for every
    dimensionless unit.  The unit must therefore also be spelled purely
    with mass units: every unit name in the string has to carry the
    dimensionality of ``kg``.

    Args:
        unit_str: pint-ready unit string (no species qualifier).

    Returns:
        True for e.g. ``"kg / kg"`` or ``"g / kg"``; False for ``"ppb"``,
        ``"mol / mol"``, dimensional units and strings pint cannot parse.

    Raises:
        ImportError: if ``pint`` is not installed.
    """
    try:
        if _dimensionality(unit_str) != _dimensionality("kg / kg"):
            return False
        mass = _dimensionality("kg")
        # Unit names = runs of letters that do not continue a number
        # (so the "e" of "1e-3" is not mistaken for a unit).
        names = re.findall(r"(?<![\w.])[^\W\d]+", unit_str)
        return bool(names) and all(
            _dimensionality(name) == mass for name in names
        )
    except ImportError:
        # A missing dependency is not a "not a mass fraction" answer.
        raise
    except Exception:
        # Unparseable unit strings are simply not mass fractions.
        return False


def _is_mole_fraction(unit_str: str) -> bool:
    """Return True if *unit_str* is a mole fraction (mol mol⁻¹, ppm, …).

    pint reduces ``mol / mol`` to *dimensionless*, exactly like ``kg / kg``,
    so a dimensionality comparison alone is also True for mass fractions.
    The unit must therefore also be spelled without foreign units: every
    unit name in the string has to be an amount-of-substance unit or be
    dimensionless on its own (``ppm``, ``ppb``, ``ppt``, …).

    Args:
        unit_str: pint-ready unit string (no species qualifier).

    Returns:
        True for e.g. ``"mol / mol"``, ``"nmol / mol"`` or ``"ppb"``; False
        for ``"kg / kg"``, dimensional units and strings pint cannot parse.

    Raises:
        ImportError: if ``pint`` is not installed.
    """
    try:
        dimensionless = _dimensionality("mol / mol")
        if _dimensionality(unit_str) != dimensionless:
            return False
        substance = _dimensionality("mol")
        # Unit names = runs of letters that do not continue a number
        # (so the "e" of "1e-3" is not mistaken for a unit).
        names = re.findall(r"(?<![\w.])[^\W\d]+", unit_str)
        return all(
            _dimensionality(name) in (substance, dimensionless)
            for name in names
        )
    except ImportError:
        # A missing dependency is not a "not a mole fraction" answer.
        raise
    except Exception:
        # Unparseable unit strings are simply not mole fractions.
        return False


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def _first_known(*molar_masses: Optional[float]) -> Optional[float]:
    """Return the first molar mass that is not ``None`` (``None`` if none is)."""
    return next((m for m in molar_masses if m is not None), None)


def _fraction_family(clean_unit: str) -> Optional[str]:
    """Classify a pint-ready unit as a ``"mass"`` or ``"mole"`` fraction.

    pint reduces both ``kg / kg`` and ``mol / mol`` to *dimensionless*, so
    the two families cannot be told apart by dimensionality; the unit names
    used in the string are inspected instead.

    Returns:
        ``"mass"`` when the unit is dimensionless and every unit name is a
        mass unit, ``"mole"`` when it is dimensionless and every unit name
        is an amount-of-substance unit or dimensionless by itself (``ppb``),
        ``None`` otherwise (dimensional, mixed, or not parseable).
    """
    try:
        dimensionless = _dimensionality("mol / mol")
        if _dimensionality(clean_unit) != dimensionless:
            return None
        mass = _dimensionality("kg")
        substance = _dimensionality("mol")
        # Runs of letters that do not continue a number ("e" of "1e-3").
        names = re.findall(r"(?<![\w.])[^\W\d]+", clean_unit)
        dims = [_dimensionality(name) for name in names]
    except ImportError:
        raise
    except Exception:
        return None

    if dims and all(d == mass for d in dims):
        return "mass"
    if all(d == substance or d == dimensionless for d in dims):
        return "mole"
    return None


def get_conversion_factor(
    unit_in: str,
    unit_out: str,
    mol_weight: Optional[float] = None,
) -> float:
    """Compute the scalar factor ``f`` such that ``value_out = f * value_in``.

    Conversion paths, tried in this order:

    1. **Identical units** — returns 1.0 immediately (pint is not needed).
    2. **Mass fraction ↔ mole fraction** — ``M_air / M_species`` (mass →
       mole) or ``M_species / M_air`` (mole → mass), combined with the scale
       difference inside each family.  ``M_species`` is the molar mass of
       the qualifier on the mass side, else of the other qualifier, else
       ``mol_weight``.
    3. **Same dimensionality** — delegated to pint.  If both units carry
       *different* species qualifiers on a mass basis, the species mass
       ratio ``M_out / M_in`` is applied as well (``kgCO2`` → ``kgC`` gives
       ``12.011 / 44.009``).  Mole-based units get no species ratio.
    4. **Mass ↔ amount of substance** — when pint alone cannot convert but a
       molar mass is known (qualifier or ``mol_weight``), the conversion is
       retried through ``g / mol``.

    NOTE(review): the species handling assumes one atom of the target
    element per source molecule (true for CO2 → C; N2O → N would need an
    extra stoichiometric factor of 2) — confirm against intended use.

    Args:
        unit_in (str): Source unit string.  Supports pint syntax plus
            mixing-ratio shorthands (``ppm``, ``ppb``, ``ppt`` / ``v``
            variants), species qualifiers (``kgC``, ``TgCO2``, …), and time
            aliases (``y`` / ``yr`` → year).
        unit_out (str): Target unit string (same conventions).
        mol_weight (float, optional): Species molar mass in g mol⁻¹.  Used
            only when a molar mass is needed and neither unit carries a
            species qualifier; qualifier molar masses take precedence.

    Returns:
        float: Multiplicative scaling factor.

    Raises:
        ImportError: if ``pint`` is not installed.
        ValueError: if the conversion is dimensionally impossible, ambiguous,
            or requires a molar mass that was not provided or is not
            positive.

    Examples::

        get_conversion_factor("kg", "g")
        # → 1000.0

        get_conversion_factor("ppb", "mol mol-1")
        # → 1e-9

        get_conversion_factor("kgC m-2 yr-1", "gC m-2 s-1")
        # → 1000 / 3.156e7   (mass unchanged, time+prefix only)

        get_conversion_factor("kgCO2 m-2 s-1", "kgC m-2 s-1")
        # → 12.011 / 44.009  (inter-species, base units cancel)

        get_conversion_factor("TgCO2 yr-1", "molCO2 s-1")
        # → 1e9 / (44.009e-3) / 3.156e7

        get_conversion_factor("kg kg-1", "ppb", mol_weight=16.043)
        # → (28.97 / 16.043) * 1e9
    """
    if unit_in.strip() == unit_out.strip():
        return 1.0

    clean_in, q_in, M_in = _parse_unit_string(unit_in)
    clean_out, q_out, M_out = _parse_unit_string(unit_out)

    # Built outside any try block so that a missing pint surfaces as the
    # documented ImportError instead of being re-wrapped as ValueError.
    ureg = _registry()

    family_in = _fraction_family(clean_in)
    family_out = _fraction_family(clean_out)

    # -----------------------------------------------------------------------
    # Mass fraction ↔ mole fraction (cross-family)
    # -----------------------------------------------------------------------
    if family_in and family_out and family_in != family_out:
        # The molar mass that matters belongs to the species on the MASS
        # side; qualifiers take precedence over the mol_weight argument.
        if family_in == "mass":
            species_M = _first_known(M_in, M_out, mol_weight)
        else:
            species_M = _first_known(M_out, M_in, mol_weight)

        if species_M is None:
            raise ValueError(
                f"Converting '{unit_in}' to '{unit_out}' crosses the "
                "mass-fraction / mole-fraction boundary and requires a "
                "species molar mass. Pass mol_weight=<M in g/mol> or use "
                "a species-qualified unit such as 'kgCH4' or 'molCH4'."
            )
        if species_M <= 0:
            raise ValueError(
                f"Converting '{unit_in}' to '{unit_out}' requires a positive "
                f"species molar mass, got {species_M!r}."
            )

        mass_to_mole = _M_AIR / species_M
        cross_factor = mass_to_mole if family_in == "mass" else 1.0 / mass_to_mole

        # Reference units within each family for the pint residual
        ref_in = "kg / kg" if family_in == "mass" else "mol / mol"
        ref_out = "kg / kg" if family_out == "mass" else "mol / mol"

        scale_in = float(ureg.Quantity(1.0, clean_in).to(ref_in).magnitude)
        scale_out = float(ureg.Quantity(1.0, ref_out).to(clean_out).magnitude)

        return float(scale_in * cross_factor * scale_out)

    # -----------------------------------------------------------------------
    # Same dimensionality: pint, plus the species mass ratio if needed
    # -----------------------------------------------------------------------
    try:
        quantity_in = ureg.Quantity(1.0, clean_in)
        dims_in = quantity_in.dimensionality
        pint_factor = float(quantity_in.to(clean_out).magnitude)
    except Exception as exc:
        # Keep the error: the name bound by "as" is cleared after the block.
        direct_error = exc
    else:
        species_factor = 1.0
        if q_in is not None and q_out is not None and q_in != q_out:
            # Moles of the source species equal moles of the target species,
            # so only mass-based units need the molar-mass ratio.
            mole_based = family_in == "mole" or (
                "[substance]" in dims_in and "[mass]" not in dims_in
            )
            if not mole_based:
                # 1 kg of CO2 holds M_C / M_CO2 kg of C: target over source.
                species_factor = M_out / M_in
        return float(species_factor * pint_factor)

    # -----------------------------------------------------------------------
    # Mass ↔ amount of substance through the species molar mass
    # -----------------------------------------------------------------------
    to_moles_M = _first_known(M_in, M_out, mol_weight)   # mass is the source
    to_mass_M = _first_known(M_out, M_in, mol_weight)    # mass is the target
    if to_moles_M is not None:
        try:
            quantity_in = ureg.Quantity(1.0, clean_in)
            attempts = (
                quantity_in / ureg.Quantity(to_moles_M, "g / mol"),
                quantity_in * ureg.Quantity(to_mass_M, "g / mol"),
            )
        except Exception:
            attempts = ()
        # At most one of the two attempts can be dimensionally consistent.
        for attempt in attempts:
            try:
                return float(attempt.to(clean_out).magnitude)
            except Exception:
                continue

    raise ValueError(
        f"Cannot convert '{unit_in}' to '{unit_out}': {direct_error}\n"
        "If a species molar mass is needed, pass mol_weight=<M in g/mol> "
        "or use qualified units such as 'kgCO2' / 'kgC'."
    ) from direct_error