# Source code for pycif.plugins.datastreams.fluxes.edgar_v5.fetch

import datetime
import glob
import os
import pandas as pd
import xarray as xr
import numpy as np
import glob
from netCDF4 import Dataset, num2date

from .....utils import path
from .....utils.hdf5 import _hdf5_lock
from logging import info, debug



# [docs]
def fetch(ref_dir, ref_file, date_interval, target_dir,
          tracer=None, **kwargs):
    """Fetch EDGAR v5 yearly files, falling back to the closest available year.

    The requested ``date_interval`` is first widened to whole years: it
    starts on January 1st of the first year and runs up to (and including)
    January 1st of the year following the last one. Every year between 1900
    and 2100 is then probed for a file matching ``ref_dir``/``ref_file``
    (``ref_file`` is expanded with ``strftime``).

    For each requested year, the file of the closest available year at or
    before it is selected; if no available year precedes the requested one,
    the earliest available file is used instead. The selected file is linked
    into ``target_dir``.

    If the selected file has a ``time`` variable with at least two records,
    each record ``d`` yields the interval ``[d - dt, d]``, where ``dt`` is the
    smallest step between consecutive records. Otherwise the file is treated
    as covering the full requested year.

    Args:
        ref_dir (str): directory where the original files are found.
        ref_file (str): (template) name of the original files.
        date_interval (list): simulation interval, as a list of the two
            bounding dates.
        target_dir (str): directory where links to the original files are
            created.
        tracer: the tracer Plugin; unused directly but kept for interface
            compatibility.
        **kwargs: unused, kept for interface compatibility.

    Returns:
        (dict, dict): ``list_files`` and ``list_dates``, both keyed by the
        January 1st timestamp of each requested year.

        list_files: for each requested year, a list containing the name of
            the resolved file repeated once per covered date interval.
        list_dates: for each requested year, a list of ``[start, end]``
            date intervals covered by the resolved file.

    Raises:
        FileNotFoundError: if no file matching ``ref_dir``/``ref_file``
            exists for any year between 1900 and 2100.
    """

    # Reshape input interval to include full years
    datei, datef = date_interval
    datei = datetime.datetime(year=datei.year, month=1, day=1)
    datef = datetime.datetime(year=datef.year + 1, month=1, day=1)

    list_dates = pd.date_range(datei, datef, freq="1YS")

    # Find all available years matching the format provided by the user.
    # The candidate years are generated in increasing order, so both lists
    # are already sorted by date and need no further sorting.
    template = f"{ref_dir}/{ref_file}"
    dates_avail = []
    files_avail = []
    for d in pd.date_range("1900", "2100", freq="1YS"):
        candidate = d.strftime(template)
        if os.path.isfile(candidate):
            dates_avail.append(d.to_pydatetime())
            files_avail.append(candidate)

    # Fail early with an explicit message instead of an obscure IndexError
    if not files_avail:
        raise FileNotFoundError(
            f"No EDGAR v5 file matching '{template}' was found "
            f"for any year between 1900 and 2100"
        )

    # Loop over years to find correct file
    tmp_files = {}
    tmp_dates = {}
    for dd in list_dates:
        year_start = dd.to_pydatetime()

        # Latest available year not after the requested one;
        # stays at the earliest available year when none precedes it
        indout = 0
        for ind, date_avail in enumerate(dates_avail):
            if date_avail > year_start:
                break
            indout = ind

        file = files_avail[indout]

        # Check if "time" is in variables and read it in a single pass
        # Do not do it with open_dataset which is slow with big files...
        dates = None
        with _hdf5_lock:
            with Dataset(file, "r") as f:
                if "time" in f.variables:
                    debug(f"Fetching times for date {dd} from {file}")
                    times = f.variables["time"]
                    dates = num2date(times[:], times.units,
                                     only_use_python_datetimes=True,
                                     only_use_cftime_datetimes=False)

        if dates is not None and np.size(dates) > 1:
            # Smallest step between consecutive records; every record is
            # taken as the end of an interval of that length
            dt = np.unique(np.diff(dates))[0]
            tmp_dates[dd] = [[d - dt, d] for d in dates]

        else:
            # No time axis, or too few records to infer a time step:
            # NOTE(review): the file is then assumed to cover the whole
            # requested year - confirm for single-record files
            tmp_dates[dd] = [[year_start,
                              datetime.datetime(dd.year + 1, 1, 1)]]

        tmp_files[dd] = len(tmp_dates[dd]) * [file]

        # Fetch to datavect
        target_file = f"{target_dir}/{os.path.basename(file)}"
        path.link(file, target_file)

    return tmp_files, tmp_dates