Source code for pycif.plugins.datastreams.fields.grib2_ecmwf.utils

from __future__ import annotations

import datetime
import itertools
import logging
import os
import re
import threading
from os import PathLike
from pathlib import Path
from typing import Any, Literal, overload

import cfgrib
import numpy as np
from .....utils.check.errclass import CifFileNotFoundError, CifKeyError, CifValueError

# ecCodes (used internally by cfgrib/gribapi) is not thread-safe: concurrent
# open/read/release calls from different dask worker threads can corrupt its
# shared C-level state and segfault the interpreter. Serialize all access:
# GribDataset takes this lock around opening a file, reading variable data
# and dropping its cfgrib handle.
# NOTE: this is a plain (non-reentrant) ``threading.Lock``; code that already
# holds it must not call back into a GribDataset method that acquires it.
_grib_lock = threading.Lock()



[docs]
class GribDataset:
    """Thread-safe wrapper around ``cfgrib`` to read GRIB files.

    Exposes GRIB variables and attributes through a small API in which
    opening the file, reading variable data and releasing the underlying
    handle are serialized via the module-level ``_grib_lock`` (ecCodes is
    not thread-safe).

    Args:
        path: Path to the GRIB file.
        read_keys: Additional GRIB keys to read, on top of the default
            ones read by ``cfgrib``. Defaults to none.
        filter_by_keys: Dict of GRIB keys/values passed to
            ``cfgrib.open_file`` to filter which messages are read.
            Defaults to none.

    Raises:
        CifFileNotFoundError: If ``path`` does not exist.

    Example:
        >>> ds = GribDataset(
        ...     "path/to/file.grib",
        ...     read_keys=["isOctahedral"],
        ...     filter_by_keys={"edition": 1},
        ... )
        >>> ds["latitude"]  # get a variable
        array([...])
        >>> ds.get_attr("isOctahedral")  # get an attribute
        1
        >>> ds.close()
    """

    def __init__(
        self,
        path: str | PathLike[str],
        read_keys: list[str] | None = None,
        filter_by_keys: dict[str, Any] | None = None,
    ) -> None:
        self.path = Path(path)
        read_keys = [] if read_keys is None else read_keys
        filter_by_keys = {} if filter_by_keys is None else filter_by_keys

        if not self.path.is_file():
            raise CifFileNotFoundError(f"{self.path} was not found")

        root_logger = logging.getLogger()

        # The root-logger level is process-wide state, so it is saved, raised
        # and restored while holding the lock: otherwise two concurrent
        # constructors could interleave and "restore" the muted level,
        # leaving logging disabled for good.
        with _grib_lock:
            previous_level = root_logger.level
            # Silence warning messages emitted while opening the file. An
            # explicit level above CRITICAL also works when the current
            # level is NOTSET (0), which a multiplication would leave as is.
            root_logger.setLevel(logging.CRITICAL + 1)
            try:
                self._ds = cfgrib.open_file(
                    self.path,
                    indexpath="",
                    read_keys=read_keys,
                    filter_by_keys=filter_by_keys,
                )
            finally:
                root_logger.setLevel(previous_level)

    def close(self) -> None:
        """Explicitly release the underlying cfgrib handles under the grib lock,
        instead of leaving it to a garbage-collection pass on an arbitrary thread.

        Calling it more than once is harmless; after it, ``variables`` and the
        accessors built on it can no longer be used.
        """
        with _grib_lock:
            self._ds = None

    def __del__(self) -> None:
        # Safety net for instances that were never closed explicitly.
        self.close()

    @property
    def variables(self) -> dict[str, cfgrib.dataset.Variable]:
        """Returns the variables mapping of the GRIB file"""
        return self._ds.variables

    @overload
    def __getitem__(self, key: str) -> np.ndarray: ...
    @overload
    def __getitem__(self, key: list[str]) -> list[np.ndarray]: ...
    def __getitem__(self, key: str | list[str]) -> np.ndarray | list[np.ndarray]:
        """Get one or several variables from the GRIB file.

        Args:
            key: Variable name, or list of variable names, to get.

        Returns:
            The variable data as a single Numpy array (if ``key`` is a
            string) or a list of Numpy arrays (if ``key`` is a list).

        Raises:
            CifKeyError: If a requested variable is not found in the file.
        """
        if isinstance(key, str):
            return self.get_var(key)
        return [self.get_var(varname) for varname in key]

    def get_var(self, varname: str) -> np.ndarray:
        """Get one variable from the GRIB file as a Numpy array.

        Args:
            varname: Variable name.

        Returns:
            np.ndarray: Variable data.

        Raises:
            CifKeyError: If the variable is not found in the GRIB file.
        """
        if varname not in self.variables:
            raise CifKeyError(
                f"The variable {varname} is not available in the file {self.path}. "
                + f"Available variables: {list(self.variables.keys())}"
            )

        with _grib_lock:
            data = self.variables[varname].data
            # Data objects exposing ``build_array`` are materialised here,
            # while the lock is held; anything else is returned as is.
            if hasattr(data, "build_array"):
                data = data.build_array()  # type: ignore

        return data

    def get_attr(self, attr: str, varname: str | None = None) -> Any:
        """Get an attribute from the GRIB file.

        If ``varname`` is provided, looks for the attribute in that
        variable's attributes; otherwise looks for it across all variables
        and returns the first one found.

        Args:
            attr: Attribute name to get (will be prefixed with ``"GRIB_"``
                to match ``cfgrib`` attribute naming).
            varname: Variable name to restrict the search to. Defaults to
                searching all variables.

        Returns:
            Any: The attribute value.

        Raises:
            CifKeyError: If the attribute is not found.
        """
        grib_attr = f"GRIB_{attr}"

        if varname is not None:
            attributes = self.variables[varname].attributes
            if grib_attr not in attributes:
                raise CifKeyError(
                    f"Could not find attribute {attr} in variable {varname} of file {self.path}"
                )
            return attributes[grib_attr]

        for variable in self.variables.values():
            if grib_attr in variable.attributes:
                return variable.attributes[grib_attr]

        raise CifKeyError(f"Could not find attribute {attr} in file {self.path}")





[docs]
# Date placeholders understood in ``file_format``, in the order in which they
# are handed to ``strptime`` when a file's date is rebuilt.
_DATE_TOKENS = ("%Y", "%m", "%d", "%H", "%M")

# Every placeholder that becomes a capture group in the file-name regex.
_TOKEN_REGEX = re.compile(r"%[YmdHM]|\{di\}")


def _compile_file_format(file_format):
    """Turn ``file_format`` into a compiled regex and its ordered placeholders.

    The returned list names, for each capture group of the regex (in order),
    the placeholder it stands for. The regex is meant to be matched against
    a file name in which ``"."`` has been replaced by ``"/"``.
    """
    pattern = (
        # "." would be a regex wildcard: swap it for "/", which cannot appear
        # in a base name (the same swap is applied to the names matched).
        file_format.replace(".", "/")
        .replace("%Y", "(\\d{4})")
        .replace("%m", "(\\d{2})")
        .replace("%d", "(\\d{2})")
        .replace("%H", "(\\d{1,2})")
        .replace("%M", "(\\d{2})")
        .replace("*", "\\d{1,2}")
        .replace("{di}", "(\\d{1,2})")
    )
    return re.compile(pattern), _TOKEN_REGEX.findall(file_format)


def _parse_valid_date(match, tokens):
    """Return ``(valid_date, lead_hours)`` for one matched file name."""
    values = {}
    for token, text in zip(tokens, match.groups()):
        # If a placeholder is repeated, its first occurrence is used.
        values.setdefault(token, text)

    lead_hours = int(values.pop("{di}", 0))

    # Hours are added as a timedelta rather than parsed by strptime, so that
    # time stamps at 24h roll over to the next day.
    hour_shift = 0
    if "%H" in values:
        hour_shift = int(values["%H"])
        values["%H"] = "00"

    present = [token for token in _DATE_TOKENS if token in values]
    base_date = datetime.datetime.strptime(
        "".join(values[token] for token in present), "".join(present)
    )
    return base_date + datetime.timedelta(hours=lead_hours + hour_shift), lead_hours


def find_valid_file(
    file_format,
    dd,
    time_freq,
    ref_dir,
    ref_dir_next,
    ref_dir_previous,
    delta_tolerance=1,
    cumul_variable=False,
    cumul_length=12,
):
    """Find the GRIB file(s) closest to a requested date.

    Lists files in ``ref_dir`` (plus, when the cumulation window straddles
    a month boundary, ``ref_dir_previous``/``ref_dir_next``), parses their
    names against ``file_format`` to recover each file's valid date and
    forecast lead time, and picks the file with the closest valid date at
    or before ``dd``. When several files share that valid date, the one
    whose forecast base time (valid date minus lead time) has the smallest
    hour of day is kept. If ``cumul_variable`` is set, also finds the next
    file after that date, matching the same forecast lead time, so the pair
    can be used to decumulate the variable.

    NOTE(review): the tie-break uses the forecast *base hour*, not the lead
    time itself; confirm this is the intended preference.

    Args:
        file_format: Date-format pattern (with optional ``{di}`` forecast
            lead-time placeholder and ``*`` wildcard for one or two digits)
            used both to build expected file names and to parse actual
            file names into dates. The placeholders may appear in any
            order.
        dd: The date to find a valid file for.
        time_freq: Expected time step between files, used together with
            ``delta_tolerance`` to bound how far the selected files' dates
            may be from ``dd``.
        ref_dir: Directory to search for candidate files.
        ref_dir_next: Directory to also search when the cumulation window
            crosses into the next month.
        ref_dir_previous: Directory to also search when the cumulation
            window crosses into the previous month.
        delta_tolerance: Tolerance, as a multiple of ``time_freq``, allowed
            between ``dd`` and a selected file's valid date.
        cumul_variable: If True, also locate the next file needed to
            decumulate the variable.
        cumul_length: Cumulation window length, in hours, used to decide
            whether ``ref_dir_previous``/``ref_dir_next`` must be searched.

    Returns:
        tuple: ``(files, dates)``, each a list with one entry (or two, when
        ``cumul_variable`` is set and a next file is found) giving the
        matched file path(s) and corresponding date(s) (re-expressed
        relative to ``dd``).

    Raises:
        CifFileNotFoundError: If no candidate file matches ``file_format``,
            if none has a valid date at or before ``dd``, or if a selected
            file is further than ``delta_tolerance * time_freq`` from
            ``dd``.
        CifValueError: If decumulation is requested but the next file does
            not have a matching forecast lead time.
    """
    ref_dir = Path(ref_dir)
    candidates = list(ref_dir.iterdir())

    # Round-trip ``dd`` through the file format so that it carries exactly
    # the resolution that file names can express.
    ref_date = datetime.datetime.strptime(dd.strftime(file_format), file_format)
    window = datetime.timedelta(hours=cumul_length)

    previous_date = ref_date - window
    if (previous_date.year, previous_date.month) < (ref_date.year, ref_date.month):
        candidates += list(Path(ref_dir_previous).iterdir())

    # Comparing (year, month) pairs also catches December -> January.
    next_date = ref_date + window
    if (next_date.year, next_date.month) > (ref_date.year, ref_date.month):
        candidates += list(Path(ref_dir_next).iterdir())

    # The regex and placeholder order depend only on the format: build once.
    name_regex, tokens = _compile_file_format(file_format)

    found_files = []
    found_dates = []
    found_base_hours = []
    found_lead_hours = []
    for candidate in candidates:
        name = candidate.name

        # Ignore index files generated by xarray and cfgrib. Only the file
        # name is tested, so "idx" in a directory name hides nothing.
        if "idx" in name:
            continue

        match = name_regex.fullmatch(name.replace(".", "/"))
        if match is None:
            continue

        valid_date, lead_hours = _parse_valid_date(match, tokens)

        found_files.append(str(candidate))
        found_dates.append(valid_date)
        found_base_hours.append(
            (valid_date - datetime.timedelta(hours=lead_hours)).hour
        )
        found_lead_hours.append(lead_hours)

    if not found_files:
        raise CifFileNotFoundError(
            f"Did not find any valid GRIB files in {ref_dir} "
            f"with format {file_format}. Please check your yml file"
        )

    # Sort everything along valid dates
    list_dates = np.array(found_dates)
    order = np.argsort(list_dates)
    list_dates = list_dates[order]
    list_files = np.array(found_files)[order]
    list_base_hours = np.array(found_base_hours)[order]
    list_lead_hours = np.array(found_lead_hours)[order]

    # Find nearest previous date
    mask = (list_dates - ref_date) <= datetime.timedelta(0)
    if not mask.any():
        raise CifFileNotFoundError(
            f"No file has valid date for {ref_date} in {ref_dir} "
            f"with format {file_format}. Please check your yml file. \n"
            f"The range of dates covered by files is: {list_dates.min()} / {list_dates.max()}"
        )

    mask = list_dates == list_dates[mask].max()

    # Among files sharing that date, keep the smallest forecast base hour
    mask = mask & (list_base_hours == list_base_hours[mask].min())

    ind_date = np.flatnonzero(mask)[0]
    file_ref1 = str(list_files[ind_date])
    date_ref1 = list_dates[ind_date]
    lead_hours_ref1 = list_lead_hours[ind_date]

    # Check that date_ref1 is not too far away from ref_date
    delta = date_ref1 - ref_date
    if abs(delta) > delta_tolerance * time_freq:
        raise CifFileNotFoundError(
            f"Could not find files close enough to the expected date (date_ref1):\n"
            f"\t- requested date: {dd.isoformat()}\n"
            f"\t- date using file formating: {ref_date.isoformat()}\n"
            f"\t- closest valid date: {date_ref1.isoformat()}\n"
            f"\t- timedelta from expected date: {delta!s}\n"
            f"\t- timedelta from file frequency: {time_freq!s}\n"
        )

    # Reconvert to original date
    dd1 = dd + (date_ref1 - ref_date)

    # Instantaneous variables only need the single snapshot
    if not cumul_variable:
        return [file_ref1], [dd1]

    # Cumulated variables: find the nearest next date; when there is none,
    # only the first file is returned.
    mask = (list_dates - ref_date) > datetime.timedelta(0)
    if not mask.any():
        return [file_ref1], [dd1]

    mask = list_dates == list_dates[mask].min()

    # Both sides of the interval must share the same forecast lead time
    mask = mask & (list_lead_hours == lead_hours_ref1)
    if not mask.any():
        raise CifValueError(
            f"Could not decumulate variable in files {file_format} "
            f"for date {dd} because could not guarantee that the same "
            "forecast hour is available for the two sides of the interval"
        )

    ind_date = np.flatnonzero(mask)[0]
    file_ref2 = str(list_files[ind_date])
    date_ref2 = list_dates[ind_date]

    # Check that date_ref2 is not too far away from ref_date
    delta = date_ref2 - ref_date
    if abs(delta) > delta_tolerance * time_freq:
        raise CifFileNotFoundError(
            f"Could not find files close enough to the expected date (date_ref2):\n"
            f"\t- requested date: {dd.isoformat()}\n"
            f"\t- date using file formating: {ref_date.isoformat()}\n"
            f"\t- closest valid date: {date_ref2.isoformat()}\n"
            f"\t- timedelta from expected date: {delta!s}\n"
            f"\t- timedelta from file frequency: {time_freq!s}\n"
            f"\t- file format: {file_format}\n"
        )

    dd2 = dd + (date_ref2 - ref_date)

    return [file_ref1, file_ref2], [dd1, dd2]




[docs]
def get_grid_type(
    domain_file: str | PathLike[str],
    filter_by_keys_dict: dict[str, Any] | None = None,
) -> Literal["regular", "octahedral", "reduced_gaussian"]:
    """Determine the GRIB grid type of a reference file.

    Args:
        domain_file: Path to the GRIB file to inspect.
        filter_by_keys_dict: Dict of GRIB keys/values used to select which
            message to read from the file.

    Returns:
        ``"octahedral"`` if the GRIB ``isOctahedral`` attribute equals 1,
        ``"regular"`` if no two consecutive values of the ``latitude``
        variable are equal, or ``"reduced_gaussian"`` otherwise.

    Raises:
        CifFileNotFoundError: If ``domain_file`` does not exist.
        CifKeyError: If the grid is not octahedral and the file has no
            ``latitude`` variable.
    """
    ds = GribDataset(
        domain_file,
        read_keys=["isOctahedral"],
        filter_by_keys=filter_by_keys_dict,
    )

    # Release the cfgrib handle here, on the calling thread, rather than
    # leaving it to whenever the object happens to be finalised.
    try:
        try:
            is_octahedral = ds.get_attr("isOctahedral") == 1
        except (KeyError, CifKeyError):
            # ``get_attr`` reports a missing attribute with CifKeyError; it
            # is caught by name so that this fallback does not depend on
            # CifKeyError deriving from KeyError.
            is_octahedral = False

        if is_octahedral:
            return "octahedral"

        lat = ds["latitude"]
    finally:
        ds.close()

    # A repeated consecutive latitude value marks the grid as reduced
    # Gaussian; otherwise it is treated as regular.
    if np.any(np.diff(lat) == 0):
        return "reduced_gaussian"
    return "regular"




[docs]
def get_jscan(
    domain_file: str | PathLike[str],
    filter_by_keys_dict: dict[str, Any] | None = None,
) -> int:
    """Get the GRIB ``jScansPositively`` scan-direction attribute.

    Args:
        domain_file: Path to the GRIB file to inspect.
        filter_by_keys_dict: Dict of GRIB keys/values used to select which
            message to read from the file.

    Returns:
        int: ``0`` if latitudes scan from North to South (the default
        ECMWF convention), ``1`` if from South to North.

    Raises:
        CifFileNotFoundError: If ``domain_file`` does not exist.
        CifValueError: If the attribute is not defined in the file.
    """
    ds = GribDataset(
        domain_file,
        read_keys=["jScansPositively"],
        filter_by_keys=filter_by_keys_dict,
    )

    # Release the cfgrib handle here, on the calling thread, rather than
    # leaving it to whenever the object happens to be finalised.
    try:
        return ds.get_attr("jScansPositively")
    except (KeyError, CifKeyError) as e:
        # ``get_attr`` reports a missing attribute with CifKeyError; it is
        # caught by name so that the translation below does not depend on
        # CifKeyError deriving from KeyError.
        raise CifValueError(
            "When 'jScansPositively' is not defined, it is possible to force it "
            "with the argument 'jScansPositively' of your Yml"
        ) from e
    finally:
        ds.close()