# Source code for pycif.plugins.datastreams.fields.lmdz_chemfield_reg.fetch

from __future__ import annotations

import datetime
import os
from os import PathLike
from pathlib import Path

import pandas as pd

from .....utils import path
from .....utils.check.errclass import CifFileNotFoundError


# pylint: disable=unused-argument

# [docs]
def fetch(
    ref_dir: str | PathLike,
    ref_file: str | PathLike,
    input_interval: tuple[datetime.datetime, datetime.datetime],
    target_dir: str | PathLike,
    tracer: object | None = None,
    **kwargs,
) -> tuple[
    dict[datetime.datetime, list[str | PathLike]],
    dict[datetime.datetime, list[tuple[datetime.datetime, datetime.datetime]]],
]:
    """Fetch LMDz chemical field files and describe their daily time slices.

    File dates are generated over ``input_interval`` (end excluded) at the
    frequency ``tracer.file_freq``. The start of the interval is always the
    first file date: it is used alone when the range is empty, and prepended
    when the first generated date falls after it. Each file date is expanded
    through ``ref_file`` (a ``strftime`` pattern) to locate the source file
    under ``ref_dir``; the source is then handed to ``path.link`` together
    with a same-named target inside ``target_dir``.

    For every file date, consecutive 24-hour intervals are produced starting
    at that date; their number equals the number of days in the month the
    file date belongs to.

    NOTE(review): when ``input_interval`` starts mid-month, the intervals of
    the first entry therefore run past the end of that month -- confirm this
    is intended.
    NOTE(review): ``tracer`` defaults to ``None`` but ``tracer.file_freq`` is
    read unconditionally once the early return is passed.

    Args:
        ref_dir: Directory holding the original files.
        ref_file: ``strftime`` pattern giving the original file names.
        input_interval: ``(start, end)`` dates of the period to fetch.
        target_dir: Directory in which the targets passed to ``path.link``
            are located.
        tracer: Object exposing ``file_freq``, the frequency used to build
            the file dates.
        **kwargs: Ignored.

    Returns:
        ``(list_files, list_dates)``, two dictionaries sharing the same keys
        (the file dates as ``datetime.datetime``). ``list_dates[key]`` is a
        list of ``(start, end)`` daily intervals and ``list_files[key]`` is
        the target path, as a string, repeated once per interval. Both are
        empty when ``ref_dir`` and ``ref_file`` are both falsy.

    Raises:
        CifFileNotFoundError: If one of the expected source files is not an
            existing regular file.
    """
    # Nothing to fetch when neither a directory nor a file pattern is given.
    if not (ref_dir or ref_file):
        return {}, {}

    start, stop = input_interval
    frequency = tracer.file_freq  # type: ignore

    # File dates over [start, stop), with `start` guaranteed to come first.
    stamps = pd.date_range(start, stop, freq=frequency, inclusive="left")
    if stamps.empty:
        stamps = pd.to_datetime([start])
    if stamps[0] > start:
        stamps = pd.to_datetime([start, *stamps.to_list()])

    # Resolve every source path up front, before anything is linked.
    sources = [Path(ref_dir, stamp.strftime(ref_file)) for stamp in stamps]

    list_files = {}
    list_dates = {}

    for stamp, source in zip(stamps, sources):
        if not source.is_file():
            raise CifFileNotFoundError(f"file '{source}' not found")

        # The target keeps the source file name, relocated into target_dir.
        target = os.path.join(target_dir, os.path.basename(source))
        path.link(source, target)

        # One 24-hour slice per day, counted from the file date.
        day_starts = pd.date_range(
            stamp, periods=stamp.days_in_month, freq="1D"
        )
        day_ends = day_starts + pd.offsets.Hour(24)

        # Expose plain datetime objects rather than pandas types.
        # pylint: disable=no-member
        key = stamp.to_pydatetime()
        intervals = list(
            zip(day_starts.to_pydatetime(), day_ends.to_pydatetime())
        )

        list_dates[key] = intervals
        list_files[key] = [str(target)] * len(intervals)

    return list_files, list_dates