# Source code for pycif.plugins.obsparsers.verify.utils

# -*- coding: utf-8 -*-

import os
from dateutil.tz import tzoffset
import numpy as np
import pandas as pd
from ....utils.check.errclass import CifValueError



[docs]
def remap_head(s):
    """Translate a short parameter name into the label used in VERIFY headers.

    The lookup is case-insensitive. Names without a dedicated header label
    are returned lower-cased and otherwise unchanged.

    Args:
        s (str): short name, e.g. ``'lat'``, ``'lon'``, ``'alt'``, ``'unit'``
            or ``'tz'``.

    Returns:
        str: the lower-case text to look for in a header line.

    """

    key = s.lower()

    # Short name -> text that appears in the header line
    header_labels = {
        'lat': 'latitude:',
        'lon': 'longitude:',
        'alt': 'altitude:',
        'unit': 'measurement unit',
        'tz': 'time zone',
    }

    return header_labels.get(key, key)




[docs]
def find_header(id_extract, header):
    """Look up a constant parameter (latitude, altitude, ...) in a file header.

    Each header line is lower-cased and searched for the label returned by
    ``remap_head(id_extract)``. For the first matching line, the text between
    the first and the second ``':'`` is stripped and returned.

    Args:
        id_extract (str): short name of the parameter to look for.
        header (iterable of str): header lines of the file.

    Returns:
        float or str: the value as a float when it parses as one, otherwise
        the lower-cased string.

    Raises:
        CifValueError: if no header line contains the requested label.

    """

    for line in header:
        lowered = line.lower()
        if remap_head(id_extract) not in lowered:
            continue

        # Only the field right after the first colon is kept
        raw_value = lowered.split(':')[1].strip()
        try:
            return float(raw_value)
        except ValueError:
            return raw_value

    raise CifValueError(f"Couldn't extract {id_extract}")




[docs]
def rescale(obs_file, header):
    """Finds out on what scale the measurement was reported and returns the
    corresponding scaling factor.

    The scale is read from the ``scale`` entry of the header; if that fails,
    the provider encoded in the file name (see ``parse_file``) is used
    instead.

    Args:
        obs_file (str): path to the observation file; only used for the
            provider fallback.
        header (iterable of str): header lines of the file.

    Returns:
        float: multiplicative scaling factor, or NaN for unknown scales.

    Notes:
        If the scale is not in the list of recognized scales, then returns a
        NaN to kill the data.

        Matching is case-insensitive: ``find_header`` returns lower-cased
        text, so case-sensitive comparisons could never match a scale read
        from the header.

    """

    try:
        scale = find_header('scale', header)

    except Exception:
        # Best-effort fallback, but without swallowing KeyboardInterrupt or
        # SystemExit as a bare ``except`` would
        scale = parse_file(obs_file)['provider']

    # find_header may return a float (e.g. "2004"); work on lower-case text
    scale = str(scale).lower()

    if '04' in scale or 'wmo' in scale:
        return 1.

    if scale == 'csiro94':
        return 1.01219

    if 'nist' in scale or 'usa' in scale:
        return 0.998

    if 'tohoku' in scale:
        return 1.0003

    if 'aircore' in scale:
        return 1.0124

    if scale == 'manufacture\'s':
        return 0.997

    if 'nies' in scale:
        return 0.997

    # If the scale is not known, then returns NaN
    return np.nan




[docs]
def parse_file(obs_file):
    """Extract metadata encoded in a VERIFY file name.

    The part of the base name before the first ``'.'`` must consist of
    exactly four ``'_'``-separated fields; the first three are returned and
    the fourth is ignored.

    Args:
        obs_file (str): path to the observation file.

    Returns:
        dict: the keys ``'parameter'``, ``'stat'`` and ``'provider'``.

    Raises:
        ValueError: if the name does not split into exactly four fields.

    """
    stem = os.path.basename(obs_file).split('.')[0]

    parameter, stat, provider, _ = stem.split('_')

    return {'parameter': parameter, 'stat': stat, 'provider': provider}




[docs]
def convert_unit(df, params, unit='ppb', default_unit='ppb'):
    """Converts between ppb, ppm, ppt. Default is conversion to ppb.

    When ``df`` has a ``'unit'`` column, every column listed in ``params`` is
    first brought to ppm according to the unit of each row, then scaled to
    the target unit. Rows whose unit is not one of the recognised ppt/ppb
    spellings are not rescaled in the first step, i.e. they are handled as
    if they were already in ppm. Without a ``'unit'`` column no value is
    changed. In all cases the ``'unit'`` column ends up equal to ``unit``.

    Args:
        df (pandas.DataFrame): observations; modified in place.
        params (iterable of str): names of the columns to convert.
        unit (str): target unit: ``'ppm'``, ``'ppt'``, ``'ppb'`` or one of
            the ppb aliases ``'ppbv'``, ``'nmol.mol-1'``, ``'nmol.mol-¹'``.
        default_unit (str): unit assigned to rows with an empty unit.

    Returns:
        pandas.DataFrame: the same ``df`` object, converted.

    Raises:
        CifValueError: if ``df`` has a ``'unit'`` column and ``unit`` is not
            a valid target. The check happens before any value is touched,
            so ``df`` is left unmodified in that case.

    """

    if 'unit' in df.columns:
        # Factor from ppm (the common reference unit) to the target unit
        from_ppm = {
            'ppm': None,
            'ppb': 1e3,
            'ppbv': 1e3,
            'nmol.mol-1': 1e3,
            'nmol.mol-¹': 1e3,
            'ppt': 1e6,
        }

        # Validate first so that an invalid target cannot leave df
        # half-converted
        if unit not in from_ppm:
            raise CifValueError(unit + " is not a valid unit for conversion")
        target_factor = from_ppm[unit]

        # Change missing unit to default unit
        df.loc[df['unit'] == '', 'unit'] = default_unit

        # Divisor bringing each known source unit to ppm; the row masks do
        # not depend on the parameter, so build them once
        to_ppm = {
            'ppt': 1e6,
            'ppb': 1e3,
            'ppbv': 1e3,
            'nmol.mol-1': 1e3,
            'nmol.mol-¹': 1e3,
        }
        masks = {src: df['unit'] == src for src in to_ppm}

        for p in params:
            # First conversion to ppm as a common reference unit
            for src, divisor in to_ppm.items():
                df.loc[masks[src], p] /= divisor

            # Then conversion to target unit if needed
            if target_factor is not None:
                df[p] *= target_factor

    df['unit'] = unit

    return df




[docs]
def shiftdate(dates, tz):
    """Shifts dates to UTC according to a time zone as defined in WDCGG files

    The time zone is expected to contain a ``utc<sign><hours>`` code such as
    ``'utc+1'``, ``'utc-10'`` or ``'utc +2'`` (optional blanks around the
    sign, any letter case), possibly surrounded by other words. Only whole
    hours are supported.

    Args:
        dates (pandas.DatetimeIndex): naive local dates.
        tz (str or None): time zone description. ``None``, an empty string
            and the plain UTC spellings leave ``dates`` untouched.

    Returns:
        pandas.DatetimeIndex: naive dates expressed in UTC.

    Raises:
        CifValueError: if no ``utc<sign><hours>`` code can be found in
            ``tz``.

    """
    # Local imports: stdlib only, keeps the block self-contained
    import re
    from datetime import timedelta, timezone

    if tz in ['utc', 'utc+0', 'utca+0', 'utc +0', '', None]:
        return dates

    # A regex instead of fixed character positions, so that 'utc +2' works
    # and a missing sign is reported instead of being read as a minus
    found = re.search(r'utc\s*([+-])\s*(\d+)', tz.lower())
    if found is None:
        raise CifValueError(f"Couldn't extract a UTC offset from '{tz}'")

    sign, hours = found.groups()
    shift = int(hours) if sign == '+' else -int(hours)

    # Attach the fixed local offset, convert to UTC, then drop the tz info
    tzlocal = timezone(timedelta(hours=shift))
    dates = dates.tz_localize(tzlocal)
    dates = dates.tz_convert('UTC')
    dates = dates.tz_localize(None)

    return dates




[docs]
def remap_extract(s):
    """Translate a generic column name into the one used in WDCGG files.

    The lookup is case-insensitive. Names without a dedicated alias are
    returned lower-cased and otherwise unchanged.

    Args:
        s (str): generic name, e.g. ``'obserror'``, ``'flag'`` or ``'mcf'``.

    Returns:
        str: the lower-case column name to extract.

    """

    key = s.lower()

    # Generic name -> column name in the file
    column_aliases = {
        'obserror': 'sd',
        'flag': 'f',
        'mcf': 'ch3ccl3',
    }

    return column_aliases.get(key, key)




[docs]
def average(df, mountain_site=None):
    """
    Average and keep only afternoon values.
    For some mountain sites, average and keep only night values.

    One value per calendar day is produced from the rows falling in the
    selected hours:

    - flask data (any row with ``network == 'flask'``): all hours (0-24);
      the time stamp is the mean hour of the day rounded up to the next
      multiple of 3 hours;
    - stations listed in ``mountain_site``: hours 0 to 3 (excluded), time
      stamp at 03:00;
    - any other station: hours 12 to 15 (excluded), time stamp at 15:00.

    The daily observation error is ``sqrt(mean(obserror ** 2)) / sqrt(n)``
    with ``n`` the number of non-missing ``obserror`` values of the day.

    Args:
        df (pandas.DataFrame): observations indexed by a DatetimeIndex, with
            at least the columns ``'unit'``, ``'station'``, ``'network'``,
            ``'parameter'`` and ``'obserror'``. Two helper columns,
            ``'hour'`` and ``'obs_error_sq'``, are added to it in place.
        mountain_site (list of str, optional): station names to average at
            night; a station matches as given or upper-cased.

    Returns:
        pandas.DataFrame: daily averages with the columns of ``df``
        (including the two helper columns).

    Raises:
        CifValueError: if one of the columns ``'unit'``, ``'station'``,
            ``'network'`` or ``'parameter'`` holds more than one value.
    """
    # None instead of a mutable default list shared between calls
    if mountain_site is None:
        mountain_site = []

    # Store the constant text columns, which are lost when averaging
    cols_name = ['unit', 'station', 'network', 'parameter']
    cols = []
    for col in cols_name:
        if len(np.unique(df[col])) > 1:
            raise CifValueError(f'There is more than 1 {col} in file')
        # Positional access: the frame is indexed by dates, not integers
        cols.append(df[col].iloc[0])

    # Select afternoon or night values
    station = cols[1]
    if np.any(df['network'] == 'flask'):
        beg = 0
        end = 24
        shift = None
        print('\t averaging flask measurement at station', station)
    elif station in mountain_site or station.upper() in mountain_site:
        beg = 0
        end = 3
        shift = 3
        print('\t averaging station', station, 'at night')

    else:
        beg = 12
        end = 15
        shift = 15
        print('\t averaging station', station, 'in the afternoon')

    df['hour'] = df.index.hour
    df['obs_error_sq'] = df['obserror'] ** 2

    sub_df = df.loc[(df.index.hour >= beg) & (df.index.hour < end)]

    # Average the selected values and the obs error day by day
    gb = sub_df.groupby(sub_df.index.date)
    count = gb.count()

    # numeric_only=True: pandas >= 2.0 no longer drops the text columns
    # silently and would raise on them
    daily_mean = gb.mean(numeric_only=True)
    yobserr = np.sqrt(daily_mean['obs_error_sq']) / np.sqrt(count['obserror'])
    daily_mean['obserror'] = yobserr

    # Reset to the structure it had initially
    final_df = daily_mean.copy()
    for name, col in zip(cols_name, cols):
        final_df[name] = col
    final_df = final_df[df.columns]
    final_df.index = pd.to_datetime(final_df.index)

    # Move the time stamp from midnight to the representative hour; plain
    # timedelta arithmetic avoids the deprecated 'H' frequency alias
    if shift is None:
        hours = np.ceil(daily_mean['hour'] / 3.) * 3
        offset = pd.to_timedelta(hours.to_numpy(), unit='h')
    else:
        offset = pd.Timedelta(hours=shift)
    final_df.index = final_df.index + offset
    final_df.index.name = df.index.name

    return final_df