Source code for pycif.utils.classes.obsparsers
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import glob
import os
from types import MethodType
import pandas as pd
from ...utils.check import errclass as error
from logging import info
from .baseclass import Plugin
from .setup import Setup
[docs]
class ObsParser(Plugin):
    """Plugin type for parsing observation time series.

    Handles time series coming from different data providers and data file
    formats. Concrete implementations live in ``pycif/plugins/obsparsers/``.
    """

    def initiate_template(self):
        """Initialise the plugin template for the ``obsparser`` plugin type.

        Delegates to ``Plugin.initiate_template`` with
        ``plg_type="obsparser"`` and declares ``do_parse`` and
        ``parse_multiple_files`` as default functions.
        """
        # NOTE(review): the meaning of the ``True`` flags is defined by
        # Plugin.initiate_template, which is not visible here; presumably
        # they mark functions to be attached to the instance -- confirm.
        super().initiate_template(
            plg_type="obsparser",
            default_functions={
                "do_parse": True,
                "parse_multiple_files": True,
            },
        )

    @classmethod
    def get_parser(cls, plg):
        """Return the loaded parser matching a plugin's provider and format.

        Args:
            plg: plugin description. Its ``provider`` and ``format``
                attributes select the registered parser, and the object
                itself is forwarded to ``load_registered`` as ``plg_orig``.

        Returns:
            The ``parser`` attribute of a temporary ``Setup`` object after
            ``Setup.load_setup`` has been applied to it with ``level=1``.
        """
        parser = cls.load_registered(
            plg.provider, plg.format, "obsparser", plg_orig=plg
        )

        # The parser is wrapped in a temporary Setup so that load_setup can
        # process it; the processed parser is then read back from the Setup.
        toload = Setup(parser=parser)
        Setup.load_setup(toload, level=1)
        return toload.parser

    @classmethod
    def register_parser(cls, provider, file_format_id, parse_module, **kwargs):
        """Register a parsing module for a provider and file format.

        Args:
            provider (str): provider of the input file
            file_format_id (str): name of the type of file with a given
                format
            parse_module (Module): module whose parsing function returns
                the file content as a pandas.DataFrame
                df[obssite_id, parameter]
            **kwargs: default options for the parsing function; forwarded
                unchanged to ``register_plugin``

        Notes:
            The parsing function signature is the same as
            :py:func:`ObsParser.parse_file`.
        """
        super().register_plugin(
            provider,
            file_format_id,
            parse_module,
            plugin_type="obsparser",
            **kwargs
        )

    def parse_file(self, obs_file, **kwargs):
        """Parse one observation file and validate the resulting DataFrame.

        Calls ``self.do_parse``, drops the rows that contain only NaNs and
        checks the result with ``self.check_df``.

        Args:
            obs_file (str): path to input file

        Keyword Args:
            **kwargs: forwarded unchanged to ``do_parse`` and ``check_df``.
                Which options are honoured depends on the concrete parser;
                the options historically documented here are ``encoding``
                (encoding of input files), ``freq`` (frequency after
                resampling; see `Offset Aliases
                <https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases>`_
                for valid strings) and ``src_freq`` (explicit frequency of
                the input file).

        Returns:
            pandas.DataFrame or None: the DataFrame returned by
            ``do_parse`` without its all-NaN rows, or ``None`` if
            ``check_df`` returns a falsy value. The ``check_df`` defined
            on this class never does: it returns ``True`` or raises.

        Raises:
            PluginError: propagated from ``check_df`` when a required
                column is missing.
        """
        df = self.do_parse(obs_file, **kwargs)

        # Rows holding no information at all are discarded before checking
        df = df.dropna(axis=0, how="all")

        if self.check_df(df, **kwargs):
            return df

        # Kept explicit for the case where check_df is overridden and
        # returns a falsy value instead of raising.
        return None

    def parse_multiple_files(self, **kwargs):
        """Parse every file matching ``self.dir_obs + "*"`` and concatenate.

        Files are processed in sorted path order. A file whose parsing
        raises ``PluginError`` is skipped and the reason is logged.

        Args:
            **kwargs: forwarded unchanged to ``self.parse_file``

        Returns:
            pandas.DataFrame: concatenation of the DataFrames of all files
            that were parsed successfully, or an empty DataFrame when no
            file could be parsed.

        Note:
            By default, the function calls ``self.parse_file``, which
            filters out all-NaN rows and checks that all required columns
            are available.
        """
        # A list is used rather than a dict keyed by file base name, so
        # that files sharing a base name in different directories (possible
        # when dir_obs contains a wildcard in a directory component) are
        # all kept instead of overwriting each other.
        frames = []

        info("Reading files in " + self.dir_obs)
        for obs_file in sorted(glob.glob(self.dir_obs + "*")):
            try:
                frames.append(self.parse_file(obs_file, **kwargs))
            except error.PluginError as e:
                info(
                    f"{obs_file} was not loaded for the following reason"
                )
                info(e)

        if not frames:
            return pd.DataFrame({})

        return pd.concat(frames)

    @staticmethod
    def check_df(df, **kwargs):
        """Check that a parsed DataFrame contains all required columns.

        A required column is accepted either under its plain name or as a
        ``(group, name)`` tuple, where the group is ``"maindata"`` for
        ``obserror`` and ``"metadata"`` for all other columns.

        Args:
            df (pd.DataFrame): DataFrame returned by the parser.
            **kwargs: Accepted for compatibility; not used.

        Returns:
            bool: True if all required columns are present.

        Raises:
            PluginError: If any of ``station``, ``network``, ``parameter``,
                ``duration`` or ``obserror`` columns are missing.
        """
        reqcols = ("station", "network", "parameter", "duration", "obserror")
        cols = df.columns

        for c in reqcols:
            meta = "maindata" if c == "obserror" else "metadata"
            if c not in cols and (meta, c) not in cols:
                # The message lines are flush-left on purpose: any leading
                # whitespace would become part of the error text.
                towrite = f"""
{c} was not returned in the dataframe
Please check your parser definition
"""
                raise error.PluginError(towrite)

        return True