# Source code for pycif.utils.classes.obsparsers

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob
import os
from types import MethodType

import pandas as pd

from ...utils.check import errclass as error
from logging import info
from .baseclass import Plugin
from .setup import Setup



[docs]
class ObsParser(Plugin):
    """Class for handling time series parsing from different data providers
    and data file formats.

    Parsers are registered and looked up under the plugin type
    ``"obsparser"``. A dataframe returned by ``parse_file`` has its all-NaN
    rows removed and must pass ``check_df``.

    """


[docs]
    def initiate_template(self):
        """Initiate the plugin template as an ``obsparser`` plugin.

        Delegates to the parent ``initiate_template`` with the plugin type
        ``"obsparser"`` and with ``do_parse`` and ``parse_multiple_files``
        both flagged ``True`` in ``default_functions``.
        """
        defaults = {"do_parse": True, "parse_multiple_files": True}
        super(ObsParser, self).initiate_template(
            plg_type="obsparser", default_functions=defaults
        )



[docs]
    @classmethod
    def get_parser(cls, plg):
        """Get the loaded parser matching the provider and format of ``plg``

        Args:
            plg: plugin description; its ``provider`` and ``format``
                attributes select the registered ``obsparser`` plugin, and
                the object itself is passed on as ``plg_orig``

        Returns:
            the ``parser`` attribute of the ``Setup`` wrapper, read after
            ``Setup.load_setup`` has been applied to it with ``level=1``
        """
        registered = cls.load_registered(
            plg.provider, plg.format, "obsparser", plg_orig=plg
        )

        # The registered parser is wrapped in a Setup so that it can be loaded
        wrapper = Setup(parser=registered)
        Setup.load_setup(wrapper, level=1)

        return wrapper.parser



[docs]
    @classmethod
    def register_parser(cls, provider, file_format_id, parse_module, **kwargs):
        """Register a parsing module for a provider and a file format

        The registration is delegated to the parent ``register_plugin`` with
        ``plugin_type="obsparser"``.

        Args:
            provider (str):  provider of the input file
            file_format_id (str): name of the type of file with a given format
            parse_module (Module): module implementing the parsing
            **kwargs: default options, forwarded unchanged to
                ``register_plugin``
        """
        parent = super(ObsParser, cls)
        parent.register_plugin(
            provider,
            file_format_id,
            parse_module,
            plugin_type="obsparser",
            **kwargs
        )



[docs]
    def parse_file(self, obs_file, **kwargs):
        """This function does the parsing (and post processing if necessary).

        Args:
            obs_file (str): path to input file

        Keyword Args:
            encoding (str): Encoding of input files
            freq (str): frequency after resampling
                        see `Offset Aliases`_ for valid strings
            src_freq (str):
                        explicit setting of the frequency in the input file
                        shouldn't be necessary

        Returns:
            pandas.DataFrame: renamed, shifted, resampled
            Dataframe df[obssite_id, parameter] with t as index
        """

        df = self.do_parse(obs_file, **kwargs)

        # Removing rows with only NaNs
        df = df.dropna(axis=0, how="all")

        # Checking that the returned dataframe has all required columns
        if self.check_df(df, **kwargs):
            return df



[docs]
    def parse_multiple_files(self, **kwargs):
        """Parses multiple files specified by a glob pattern and stores the
        content into a datastore.

        Args:
            self: the plugin with its describing arguments (in particular dir_obs)

        Returns:
            dict: {obs_file} = df[obssite_id, parameter]

        Note:
            By default, the function calls `self.parse_file`, which filters out
            NaNs and check that all required columns are available.


        """

        dfs = {}

        info("Reading files in " + self.dir_obs)
        for obs_file in sorted(glob.glob(self.dir_obs + "*")):
            try:
                dfs[os.path.basename(obs_file)] = self.parse_file(
                    obs_file, **kwargs
                )

            except error.PluginError as e:
                info(
                    "{} was not loaded for the following reason".format(
                        obs_file
                    )
                )
                info(e)

        if dfs != {}:
            return pd.concat(list(dfs.values()))
        else:
            return pd.DataFrame({})



[docs]
    @staticmethod
    def check_df(df, **kwargs):
        """Check that all required columns have been loaded

        :param df:
        :param kwargs:
        :return:
        """

        reqcols = ["station", "network", "parameter", "duration", "obserror"]
        cols = df.columns
        for c in reqcols:
            meta = "metadata"
            if c == "obserror":
                meta = "maindata"

            if c not in cols and (meta, c) not in cols:
                towrite = """
                    {} was not returned in the dataframe
                    Please check your parser definition
                    """.format(
                    c
                )
                raise error.PluginError(towrite)

        return True