Source code for pycif.plugins.obsparsers.verify.utils

# -*- coding: utf-8 -*-

import os
from dateutil.tz import tzoffset
import numpy as np
import pandas as pd


def remap_head(s):
    """Translate a short parameter name into the label searched for in a
    VERIFY file header.

    The comparison is case-insensitive. Names without a dedicated alias are
    returned lower-cased and otherwise unchanged.

    Args:
        s (str): short parameter name (e.g. ``'lat'``, ``'tz'``)

    Returns:
        str: lower-case label to look for in the header lines
    """
    aliases = {
        'lat': 'latitude:',
        'lon': 'longitude:',
        'alt': 'altitude:',
        'unit': 'measurement unit',
        'tz': 'time zone',
    }
    lowered = s.lower()
    return aliases.get(lowered, lowered)
def find_header(id_extract, header):
    """Find the value of a constant parameter (e.g., latitude, altitude,
    etc.) in the header of a file.

    Each header line is lower-cased and searched for the label returned by
    ``remap_head(id_extract)``. The value is the text located between the
    first and the second ``':'`` of the first suitable line, stripped of
    surrounding blanks.

    Args:
        id_extract (str): name of the parameter to extract
        header (iterable of str): header lines of the file

    Returns:
        float or str: the value as a float when it parses as one, otherwise
        the lower-cased string

    Raises:
        ValueError: if no header line provides the parameter
    """
    label = remap_head(id_extract)

    for ln in header:
        lowered = ln.lower()
        if label not in lowered:
            continue

        fields = lowered.split(':')
        if len(fields) < 2:
            # The label only appears in free text (no 'label: value' pair);
            # keep looking instead of failing with an IndexError
            continue

        value = fields[1].strip()
        try:
            return float(value)
        except ValueError:
            return value

    raise ValueError("Couldn't extract {}".format(id_extract))
def rescale(obs_file, header):
    """Find out on what scale the measurement was reported and return the
    corresponding scaling factor.

    The scale is read from the ``scale`` field of the header. When it cannot
    be extracted, the provider part of the file name is used instead.
    Matching is case-insensitive, because values read from the header are
    always lower-cased by ``find_header``.

    Args:
        obs_file (str): path to the observation file
        header (iterable of str): header lines of the file

    Returns:
        float: the scaling factor associated with the recognized scale

    Notes:
        If the scale is not in the list of recognized scales, then returns a
        NaN to kill the data
    """
    try:
        scale = find_header('scale', header)
    except Exception:
        # Best-effort fallback on the file name, whatever went wrong while
        # reading the header (but let KeyboardInterrupt/SystemExit through)
        scale = parse_file(obs_file)['provider']

    # find_header returns a float when the field parses as a number;
    # work on a lower-cased string so that every test below is applicable
    key = str(scale).lower()

    if '04' in key or 'wmo' in key:
        return 1.
    if key == 'csiro94':
        return 1.01219
    if 'nist' in key or 'usa' in key:
        return 0.998
    if 'tohoku' in key:
        return 1.0003
    if 'aircore' in key:
        return 1.0124
    if key == "manufacture's":
        return 0.997
    if 'nies' in key:
        return 0.997

    # If the scale is not known, then returns NaN
    return np.nan
def parse_file(obs_file):
    """Parse a VERIFY file name and extract the corresponding information.

    Only the part of the base name located before the first ``'.'`` is used.
    It must be made of exactly four ``'_'``-separated fields: parameter,
    station, provider and a last field which is ignored.

    Args:
        obs_file (str): path to the observation file

    Returns:
        dict: the fields under the keys ``'parameter'``, ``'stat'`` and
        ``'provider'``

    Raises:
        ValueError: if the name does not split into exactly four fields
    """
    stem = os.path.basename(obs_file).split('.')[0]
    parameter, stat, provider, _ = stem.split('_')
    return {'parameter': parameter, 'stat': stat, 'provider': provider}
def convert_unit(df, params, unit='ppb', default_unit='ppb'):
    """Convert between ppb, ppm, ppt. Default is conversion to ppb.

    The conversion is done in place and only if ``df`` has a ``'unit'``
    column; otherwise ``df`` is returned untouched. Values are first brought
    to ppm, then to the target unit. Rows whose unit is not one of the
    recognized ppt/ppb spellings are taken as already being in ppm.

    Args:
        df (pandas.DataFrame): observations, with the unit of every row in
            the ``'unit'`` column
        params (iterable of str): names of the columns to convert
        unit (str): target unit; one of ``'ppm'``, ``'ppt'``, ``'ppb'``,
            ``'ppbv'``, ``'nmol.mol-1'``, ``'nmol.mol-¹'``
        default_unit (str): unit assumed for rows with an empty unit

    Returns:
        pandas.DataFrame: ``df`` itself, with converted columns and the
        ``'unit'`` column set to ``unit``

    Raises:
        ValueError: if ``unit`` is not a valid target unit. The check is
            done before any value is modified.
    """
    if 'unit' not in df.columns:
        return df

    # Factor to apply to a value in ppm to express it in the target unit
    from_ppm = {
        'ppb': 1e3, 'ppbv': 1e3, 'nmol.mol-1': 1e3, 'nmol.mol-¹': 1e3,
        'ppt': 1e6,
        'ppm': 1.,
    }
    if unit not in from_ppm:
        raise ValueError(
            "{} is not a valid unit for conversion".format(unit))
    factor = from_ppm[unit]

    # Change missing unit to default unit
    df.loc[df['unit'] == '', 'unit'] = default_unit

    # Divisor bringing a value to ppm, the common reference unit.
    # Units do not change while looping on parameters, hence the row
    # selections are computed only once
    to_ppm = [
        ('ppt', 1e6),
        ('ppb', 1e3),
        ('ppbv', 1e3),
        ('nmol.mol-1', 1e3),
        ('nmol.mol-¹', 1e3),
    ]
    selections = [(df['unit'] == name, divisor) for name, divisor in to_ppm]

    for p in params:
        # First conversion to ppm
        for rows, divisor in selections:
            df.loc[rows, p] /= divisor

        # Then conversion to target unit if needed
        if unit != 'ppm':
            df[p] *= factor

    df['unit'] = unit

    return df
def shiftdate(dates, tz):
    """Shift dates to UTC according to a time zone as defined in WDCGG
    files.

    Args:
        dates: time-zone naive dates exposing ``tz_localize`` and
            ``tz_convert``
        tz (str or None): time zone description containing a word such as
            ``'utc+3'`` or ``'utc-10'`` (whole hours)

    Returns:
        the dates expressed in UTC, time-zone naive; ``dates`` itself when
        ``tz`` is empty, ``None`` or one of the known spellings of UTC
    """
    already_utc = ('utc', 'utc+0', 'utca+0', 'utc +0', '', None)
    if tz in already_utc:
        return dates

    # First word holding the offset, e.g. 'utc+3'
    candidates = [word for word in tz.split() if 'utc' in word]
    utc_code = candidates[0]

    # Hours come after the sign, which sits right after 'utc'
    hours = int(utc_code[4:])
    sign = 1 if utc_code[3] == '+' else -1
    local_zone = tzoffset('local', 60 * 60 * hours * sign)

    return dates.tz_localize(local_zone).tz_convert('UTC').tz_localize(None)
def remap_extract(s):
    """Translate a generic column name into the name used to extract
    columns in the context of WDCGG.

    The comparison is case-insensitive. Names without a dedicated alias are
    returned lower-cased and otherwise unchanged.

    Args:
        s (str): generic column name (e.g. ``'obserror'``, ``'flag'``)

    Returns:
        str: lower-case name of the column to extract
    """
    aliases = {
        'obserror': 'sd',
        'flag': 'f',
        'mcf': 'ch3ccl3',
    }
    lowered = s.lower()
    return aliases.get(lowered, lowered)
def average(df, mountain_site=None):
    """Average and keep only afternoon values. For some mountain sites,
    average and keep only night values.

    One value per day is computed from the observations falling in an hour
    window which depends on the station:

    - flask measurements (``network`` is ``'flask'``): hours 0 to 24; the
      daily value is stamped at the mean hour of the day, rounded up to a
      multiple of 3 hours
    - mountain sites: hours 0 to 3, stamped at 03:00
    - other stations: hours 12 to 15, stamped at 15:00

    The daily observation error is
    ``sqrt(mean(obserror ** 2)) / sqrt(number of obserror values)``.

    Args:
        df (pandas.DataFrame): observations indexed by date (a
            DatetimeIndex), with at least the columns ``'obserror'``,
            ``'unit'``, ``'station'``, ``'network'`` and ``'parameter'``.
            The columns ``'hour'`` and ``'obs_error_sq'`` are added to it
            in place.
        mountain_site (list of str, optional): stations (as given or
            upper-cased) for which night values are kept

    Returns:
        pandas.DataFrame: daily averages, with the same columns as ``df``
        (including ``'hour'`` and ``'obs_error_sq'``)

    Raises:
        ValueError: if ``df`` mixes several units, stations, networks or
            parameters
    """
    if mountain_site is None:
        mountain_site = ()

    # Store some columns which are gonna be deleted by the averaging
    cols_name = ['unit', 'station', 'network', 'parameter']
    cols = []
    for col in cols_name:
        if len(df[col].unique()) > 1:
            raise ValueError('There is more than 1 %s in file' % col)
        # Positional access: the index holds dates, not integer labels
        cols.append(df[col].iloc[0])

    # Select afternoon or night values
    station = cols[1]
    if np.any(df['network'] == 'flask'):
        beg = 0
        end = 24
        shift = None
        print('\t averaging flask measurement at station', station)

    elif station in mountain_site or station.upper() in mountain_site:
        beg = 0
        end = 3
        shift = 3
        print('\t averaging station', station, 'at night')

    else:
        beg = 12
        end = 15
        shift = 15
        print('\t averaging station', station, 'in the afternoon')

    df['hour'] = df.index.hour
    df['obs_error_sq'] = df['obserror'] ** 2

    in_window = (df.index.hour >= beg) & (df.index.hour < end)
    sub_df = df.loc[in_window]

    # Average values and obs error day by day; text columns cannot be
    # averaged and are restored afterwards
    gb = sub_df.groupby(sub_df.index.date)
    count = gb.count()
    daily = gb.mean(numeric_only=True)
    daily['obserror'] = \
        np.sqrt(daily['obs_error_sq']) / np.sqrt(count['obserror'])

    # Reset to the structure it had initially
    final_df = daily.copy()
    for name, value in zip(cols_name, cols):
        final_df[name] = value

    final_df = final_df[df.columns]
    final_df.index = pd.to_datetime(final_df.index)

    # Stamp every daily value at the end of its averaging window
    if shift is None:
        # Flask: one offset per day, from the mean hour of the samples
        hours = np.ceil(daily['hour'] / 3.) * 3
        final_df.index = \
            final_df.index + pd.to_timedelta(hours.to_numpy(), unit='h')
    else:
        final_df.index = final_df.index + pd.Timedelta(hours=shift)

    final_df.index.name = df.index.name

    return final_df