Source code for pycif.plugins.datastreams.fluxes.tm5.fetch

import os
import datetime
import pandas as pd
import xarray as xr
from netCDF4 import Dataset
import numpy as np
from .....utils import path
from .....utils.hdf5 import _hdf5_lock



[docs]
def fetch(ref_dir, ref_file, date_interval, target_dir, tracer=None, **kwargs):
    """Fetch TM5 flux files and derive the validity time intervals they cover.

    Candidate file dates are generated over `date_interval` at
    `tracer.file_freq`. For every candidate whose file exists on disk, the
    ``time_start`` variable of each emission source under the
    ``glb600x400/<name>/`` group is read, where ``<name>`` is
    `tracer.varname` or, when that is an empty string, `tracer.orig_name`.
    Every source is read (so a missing source group raises), but only the
    time axis of the last listed source is kept to build the intervals.

    Intervals are ``[start, end]`` pairs between consecutive ``time_start``
    values; the last interval is closed with the end of `date_interval`.
    The file is then handed to ``path.link`` with a destination of the same
    basename inside `target_dir`.

    Args:
        ref_dir (str): Directory containing the reference input files. It is
            part of the ``strftime`` pattern, so date directives in it are
            expanded as well.
        ref_file (str): Filename pattern of the input files (a ``strftime``
            format string).
        date_interval (list[datetime.datetime]): ``[date_i, date_f]``
            simulation interval to cover.
        target_dir (str): Directory where the resolved files are linked.
        tracer: The flux tracer plugin, providing ``file_freq`` and
            ``varname``/``orig_name`` (used to locate the species group in
            the file). Although it defaults to ``None``, it is required in
            practice: its attributes are read unconditionally.
        **kwargs: Accepted for interface compatibility; unused here.

    Returns:
        tuple[dict, dict]: ``(list_files, list_dates)``, each keyed by the
        candidate file date. ``list_dates[dd]`` is the list of
        ``[start, end]`` pairs found in the file and ``list_files[dd]``
        repeats the linked file path once per interval. Dates with no
        existing file are absent from both dictionaries.
    """
    # Emission sources whose ``time_start`` variable is read in each file.
    sources = ("biomass-burning", "rice", "wetlands", "other")
    region = "glb600x400"

    list_period_dates = pd.date_range(
        date_interval[0], date_interval[1], freq=tracer.file_freq
    ).to_pydatetime()

    list_dates = {}
    list_files = {}
    for dd in list_period_dates:
        input_file = dd.strftime(f"{ref_dir}/{ref_file}")
        if not os.path.isfile(input_file):
            continue

        # Open the dataset as a context manager so the handle is always
        # released; the lock is acquired first and released last, so both the
        # open and the close happen while it is held.
        with _hdf5_lock, Dataset(input_file, "r") as nc_id:
            name = tracer.varname if tracer.varname != "" else tracer.orig_name
            for source in sources:
                source_path = region + "/" + name + "/" + source + "/"

                # Each row of ``time_start`` is unpacked as the positional
                # arguments of ``datetime.datetime``.
                # NOTE(review): every iteration overwrites the previous one,
                # so only the last source's time axis is used below —
                # presumably all sources share the same axis; confirm.
                nc_time_start = np.array(
                    [
                        datetime.datetime(*bagger)
                        for bagger in nc_id[source_path + "time_start"]
                    ]
                )

        # Consecutive start times delimit the intervals; the final one is
        # closed with the end of the requested simulation period.
        intervals = [
            [hh0, hh1]
            for hh0, hh1 in zip(nc_time_start[:-1], nc_time_start[1:])
        ]
        intervals.append([nc_time_start[-1], date_interval[-1]])
        list_dates[dd] = intervals

        # Fetching
        target_file = f"{target_dir}/{os.path.basename(input_file)}"
        path.link(input_file, target_file)
        list_files[dd] = len(intervals) * [target_file]

    return list_files, list_dates