Source code for pycif.utils.classes.obsparsers
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob
import os
from types import MethodType
import pandas as pd
from ...utils.check import errclass as error
from logging import info
from .baseclass import Plugin
from .setup import Setup
class ObsParser(Plugin):
"""Class for handling time series parsing from different data providers
and data file formats.
"""
def initiate_template(self):
super(ObsParser, self).initiate_template(
plg_type="obsparser",
default_functions={
"do_parse": True, "parse_multiple_files": True
}
)
@classmethod
def get_parser(cls, plg):
"""Get the correct Parser for a provider and file_format_id
Args:
provider (str): provider of the input file
file_format_id (str): name of the type of file with a given format
Returns:
Parser: Parser for provider and file_format_id
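
        Example:
            A minimal sketch, assuming ``plg`` is a plugin whose ``provider``
            and ``format`` attributes match a registered parser::

                parser = ObsParser.get_parser(plg)
                datastore = parser.parse_multiple_files()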
"""
parser = cls.load_registered(
plg.provider, plg.format, "obsparser", plg_orig=plg
)
# Loading the parser
toload = Setup(parser=parser)
Setup.load_setup(toload, level=1)
return toload.parser
@classmethod
def register_parser(cls, provider, file_format_id, parse_module, **kwargs):
"""Register a parsing function for provider and format with default
options
Args:
provider (str): provider of the input file
file_format_id (str): name of the type of file with a given format
parse_module (Module):
returns file content
as pandas.DataFrame df[obssite_id, parameter]
**kwargs: default options for parse_function
Notes:
The parse_function signature is the same as
the :py:func:`Parser.parse_file`
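
        Example:
            A minimal sketch of a hypothetical registration; the module name
            ``my_parser`` (exposing a ``do_parse`` function) and the option
            values are illustrative only::

                from pycif.utils.classes.obsparsers import ObsParser
                import my_parser

                ObsParser.register_parser(
                    "my_provider", "my_format", my_parser,
                    encoding="utf-8", freq="1H"
                )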
"""
super(ObsParser, cls).register_plugin(
provider, file_format_id, parse_module, plugin_type="obsparser", **kwargs
)
def parse_file(self, obs_file, **kwargs):
"""This function does the parsing (and post processing if necessary).
Args:
obs_file (str): path to input file
Keyword Args:
encoding (str): Encoding of input files
freq (str): frequency after resampling
see `Offset Aliases`_ for valid strings
src_freq (str):
explicit setting of the frequency in the input file
shouldn't be necessary
Returns:
pandas.DataFrame: renamed, shifted, resampled
Dataframe df[obssite_id, parameter] with t as index
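
        Example:
            A sketch, assuming ``parser`` is an initialized ObsParser; the
            file path and options are illustrative::

                df = parser.parse_file(
                    "/path/to/obs_file.txt", encoding="utf-8", freq="1H"
                )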
"""
df = self.do_parse(obs_file, **kwargs)
# Removing rows with only NaNs
df = df.dropna(axis=0, how="all")
# Checking that the returned dataframe has all required columns
if self.check_df(df, **kwargs):
return df
def parse_multiple_files(self, **kwargs):
"""Parses multiple files specified by a glob pattern and stores the
content into a datastore.
Args:
self: the plugin with its describing arguments (in particular dir_obs)
Returns:
dict: {obs_file} = df[obssite_id, parameter]
Note:
By default, the function calls `self.parse_file`, which filters out
NaNs and check that all required columns are available.
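
        Example:
            A sketch, assuming ``parser`` is an initialized ObsParser whose
            ``dir_obs`` attribute points to a directory of observation files::

                datastore = parser.parse_multiple_files(encoding="utf-8")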
"""
dfs = {}
info("Reading files in " + self.dir_obs)
for obs_file in sorted(glob.glob(self.dir_obs + "*")):
try:
dfs[os.path.basename(obs_file)] = self.parse_file(
obs_file, **kwargs
)
except error.PluginError as e:
info(
"{} was not loaded for the following reason".format(
obs_file
)
)
info(e)
        if dfs:
            return pd.concat(list(dfs.values()))
        else:
            return pd.DataFrame({})
@staticmethod
def check_df(df, **kwargs):
"""Check that all required columns have been loaded
:param df:
:param kwargs:
:return:
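
        Example:
            A minimal dataframe that passes the check; all values are
            illustrative::

                import pandas as pd

                df = pd.DataFrame(
                    {
                        "station": ["STAT1"],
                        "network": ["NET1"],
                        "parameter": ["CO2"],
                        "duration": [1.0],
                        "obserror": [0.1],
                    }
                )
                ObsParser.check_df(df)  # returns True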
"""
reqcols = ["station", "network", "parameter", "duration", "obserror"]
cols = df.columns
for c in reqcols:
meta = "metadata"
if c == "obserror":
meta = "maindata"
if c not in cols and (meta, c) not in cols:
towrite = """
{} was not returned in the dataframe
Please check your parser definition
""".format(
c
)
raise error.PluginError(towrite)
return True