Source code for pycif.utils.classes.obsparsers

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob
import os
from types import MethodType

import pandas as pd

from ...utils.check import errclass as error
from logging import info
from .baseclass import Plugin
from .setup import Setup


class ObsParser(Plugin):
    """Class for handling time series parsing from different data providers
    and data file formats.

    The actual reading of a file is delegated to ``self.do_parse``, which is
    not defined in this class.
    NOTE(review): ``do_parse`` is presumably attached by the plugin machinery
    from the registered parser module -- confirm against ``Plugin``.
    """

    def initiate_template(self):
        """Initialise the plugin template as an ``obsparser`` plugin.

        Calls the parent ``initiate_template`` with ``plg_type="obsparser"``
        and declares ``do_parse`` and ``parse_multiple_files`` in
        ``default_functions`` (both mapped to ``True``).
        """
        super(ObsParser, self).initiate_template(
            plg_type="obsparser",
            default_functions={
                "do_parse": True,
                "parse_multiple_files": True,
            },
        )

    @classmethod
    def get_parser(cls, plg):
        """Get the registered parser matching a plugin description.

        Args:
            plg: plugin description; its ``provider`` and ``format``
                attributes select the registered parser, and the object
                itself is forwarded as ``plg_orig``.

        Returns:
            The ``parser`` attribute of a temporary ``Setup`` object after
            ``Setup.load_setup`` has been run on it with ``level=1``.
        """
        parser = cls.load_registered(
            plg.provider, plg.format, "obsparser", plg_orig=plg
        )

        # Loading the parser: wrap it in a Setup so that load_setup can
        # process it, then hand back the processed parser.
        toload = Setup(parser=parser)
        Setup.load_setup(toload, level=1)

        return toload.parser

    @classmethod
    def register_parser(cls, provider, file_format_id, parse_module, **kwargs):
        """Register a parsing module for a provider and a file format.

        Args:
            provider (str): provider of the input file
            file_format_id (str): name of the type of file with a given
                format
            parse_module (Module): module implementing the parsing
            **kwargs: default options, forwarded unchanged to
                ``register_plugin``

        Notes:
            This is a thin wrapper around the parent ``register_plugin``
            with ``plugin_type="obsparser"``.
        """
        super(ObsParser, cls).register_plugin(
            provider,
            file_format_id,
            parse_module,
            plugin_type="obsparser",
            **kwargs
        )

    def parse_file(self, obs_file, **kwargs):
        """Parse one observation file and validate the resulting dataframe.

        Args:
            obs_file (str): path to input file
            **kwargs: forwarded unchanged to ``self.do_parse`` and to
                ``self.check_df``; the supported options depend on the
                parser implementation.

        Returns:
            pandas.DataFrame: the dataframe returned by ``self.do_parse``
            with all-NaN rows removed, or ``None`` if ``self.check_df``
            returns a falsy value.

        Raises:
            PluginError: raised by the default ``check_df`` when a required
                column is missing.
        """
        df = self.do_parse(obs_file, **kwargs)

        # Removing rows with only NaNs
        df = df.dropna(axis=0, how="all")

        # Checking that the returned dataframe has all required columns
        if self.check_df(df, **kwargs):
            return df

        # Only reachable if check_df is overridden to return a falsy value;
        # the default implementation either raises or returns True.
        return None

    def parse_multiple_files(self, **kwargs):
        """Parse every file matching ``self.dir_obs + "*"`` and concatenate
        the results.

        Args:
            **kwargs: forwarded unchanged to ``self.parse_file``.

        Returns:
            pandas.DataFrame: concatenation of the dataframes parsed from
            each file, in sorted path order; an empty dataframe if no file
            could be parsed.

        Note:
            ``self.dir_obs`` is used as a plain prefix of the glob pattern,
            so it must end with a path separator to restrict the search to
            the content of one directory.

            Files for which parsing raises ``PluginError`` are reported
            through ``logging.info`` and skipped; any other exception
            propagates to the caller.
        """
        # A list (rather than a dict keyed by base name) keeps every parsed
        # file even when two matched paths share the same base name, e.g.
        # when ``dir_obs`` contains a wildcard in a directory component.
        frames = []

        info("Reading files in " + self.dir_obs)

        for obs_file in sorted(glob.glob(self.dir_obs + "*")):
            try:
                frames.append(self.parse_file(obs_file, **kwargs))

            except error.PluginError as e:
                info(
                    "{} was not loaded for the following reason".format(
                        obs_file
                    )
                )
                info(e)

        if not frames:
            return pd.DataFrame({})

        return pd.concat(frames)

    @staticmethod
    def check_df(df, **kwargs):
        """Check that all required columns have been loaded.

        A column is considered present if its name appears directly in
        ``df.columns`` or as the two-level label ``("metadata", name)``
        (``("maindata", "obserror")`` for the observation error).

        Args:
            df (pandas.DataFrame): dataframe returned by the parser
            **kwargs: accepted for signature compatibility; not used here

        Returns:
            bool: ``True`` when every required column is present.

        Raises:
            PluginError: for the first required column that is missing.
        """
        # (top-level label, column name) pairs, checked in this order
        required = (
            ("metadata", "station"),
            ("metadata", "network"),
            ("metadata", "parameter"),
            ("metadata", "duration"),
            ("maindata", "obserror"),
        )

        cols = df.columns

        for meta, c in required:
            if c not in cols and (meta, c) not in cols:
                towrite = """
                    {} was not returned in the dataframe
                    Please check your parser definition
                    """.format(
                    c
                )
                raise error.PluginError(towrite)

        return True