Source code for pycif.plugins.datastreams.fluxes.flux_plugin_template.fetch

import datetime
import glob
import os
import pandas as pd

import numpy as np

from .....utils import path
from logging import debug


[docs]def fetch(ref_dir, ref_file, input_dates, target_dir,
          tracer=None, component=None, **kwargs):
    """
    Fetch files and dates for the given simulation interval.
    Determine what dates are available in the input data within the simulation interval.
    Link reference files to the working directory to avoid interactions with the outer
    world.
    
    Should include input data dates encompassing the simulation interval, which means
    that, e.g, if input data are at the monthly scale and the simulation interval
    starts on 2010-01-15 to 2010-03-15, the output should at least include the input
    data dates for 2010-01, 2010-02 and 2010-03.
    
    Note:
        The three main arguments (:bash:`ref_dir`, :bash:`ref_file` and :bash:`file freq`) can either be
        defined as :bash:`dir`, :bash:`file` and :bash:`file_freq` respectively
        in the relevant davavect/flux/my_spec paragrah in the yaml,
        or, if not available, they are fetched from the corresponding components/flux paragraph.
        If one of the three needs to have a default value, it can be
        integrated in the input_arguments dictionary in :bash:`__init__.py`

    Args:
        ref_dir (str): the path to the input files
        ref_file (str): format of the input files
        input_dates (list): simulation interval (start and end dates)
        target_dir (str): where to copy
        tracer: the tracer Plugin, corresponding to the paragraph
            :bash:`datavect/components/fluxes/parameters/my_species` in the
            configuration yaml; can be needed to fetch extra information
            given by the user
        component: the component Plugin, same as tracer; corresponds to the paragraph
            :bash:`datavect/components/fluxes` in the configuration yaml
    
    Return:
        (dict, dict): returns two dictionaries: list_files and list_dates

        list_files: for each date that begins a period, a list containing
            the names of the files that are available for the dates within this period
        list_dates: for each date that begins a period, a list containing
            the date intervals (in the form of a list of two dates each)
            matching the files listed in list_files

    Note:
        The output format can be illustrated as follows (the dates are shown as strings,
        but datetime.datetime objects are expected):

        .. code-block:: python

          list_dates = {
              "2019-01-01 00:00":
                  [["2019-01-01 00:00", "2019-01-01 03:00"],
                   ["2019-01-01 03:00", "2019-01-01 06:00"],
                   ["2019-01-01 06:00", "2019-01-01 09:00"],
                   ["2019-01-01 09:00", "2019-01-01 12:00"]],
              "2019-01-01 12:00":
                  [["2019-01-01 12:00", "2019-01-01 15:00"],
                   ["2019-01-01 15:00", "2019-01-01 18:00"],
                   ["2019-01-01 18:00", "2019-01-01 21:00"],
                   ["2019-01-01 21:00", "2019-01-02 00:00"]]
          }

          list_files = {
              "2019-01-01 00:00":
                  ["path_to_file_for_20190101_0000",
                   "path_to_file_for_20190101_0300",
                   "path_to_file_for_20190101_0600",
                   "path_to_file_for_20190101_0900"],
              "2019-01-01 12:00":
                  ["path_to_file_for_20190101_1200",
                   "path_to_file_for_20190101_1500",
                   "path_to_file_for_20190101_1800",
                   "path_to_file_for_20190101_2100"]
          }

        In the example above, the native temporal resolution is 3-hourly,
        and files are available every 12 hours

    Note:
        There is no specific rule for sorting dates and files into separate keys of
        the output dictionaries. The usage rule would be to have one dictionary key
        per input file, therein unfolding all available dates in the corresponding file;
        in that rule, the content of :bash:`list_files` is a duplicate of the same file over again
        in every given key of the dictionary.

        But any combination of the keys is valid as long as the list of dates of each key corresponds
        exactly to the file with the same index.
        Hence, it is acceptable to have, e.g., one key with all dates and files,
        or one key per date even though there are several date per file.

        The balance between the number of keys and the size of each key should be
        determined by the standard usage expected with the data.
        overall, a good practice is to have one key in the input data for each
        sub-simulation for which it will be used afterwards by the model.

        For instance, CHIMERE emission files store hourly emissions for CHIMERE
        sub-simulations, typically 24-hour long. It thus makes sense to have
        one key per 24-hour period and in each key the hour emissions.


    """
    
    debug("Fetching files with the following information: \n"
          "- datei/datef = {}\n"
          "- dir = {}\n"
          "- file = {}\n"
          "- file_freq = {}\n\n"
          "These three main arguments can either be defined in the relevant flux/my_spec "
          "paragrah in the yaml, or, if not available, they are fetched from the "
          "corresponding components/flux paragraph.\n"
          "If one of the three needs to have a default value, it can be "
          "integrated in the input_arguments dictionary in __init__.py for {}".format(
        input_dates, ref_dir, ref_file, tracer.file_freq, __package__
    ))
    
    list_period_dates = \
        pd.date_range(input_dates[0], input_dates[1],
                      freq=tracer.file_freq)
    list_dates = {}
    list_files = {}
    for dd in list_period_dates:
        file = dd.strftime("{}/{}".format(ref_dir, ref_file))
        file_hours = pd.date_range(
            dd, dd + pd.to_timedelta(tracer.file_freq), freq="1H")
        list_dates[dd] = [[hh, hh + datetime.timedelta(hours=1)]
                          for hh in file_hours]
        list_files[dd] = (len(file_hours) * [file])
        
        # Fetching
        if os.path.isfile(file):
            target_file = "{}/{}".format(target_dir, os.path.basename(file))
            path.link(file, target_file)
    
    debug(
        "Fetched files and dates as follows:\n"
        "Dates: {\n" +
        "\n".join(["\n".join(["  {}:".format(ddi)]
                             + ["     {}".format(dd)
                                for dd in list_dates[ddi]])
                   for ddi in list_dates])
        + "\n}\n\n" +
        "Files: {\n" +
        "\n".join(["\n".join(["  {}:".format(ddi)]
                             + ["     {}".format(dd)
                                for dd in list_files[ddi]])
                   for ddi in list_files])
        + "\n}"
    )
    
    return list_files, list_dates