# Source code for pycif.plugins.obsparsers.verify.utils
# -*- coding: utf-8 -*-
import os
import re

from dateutil.tz import tzoffset
import numpy as np
import pandas as pd
[docs]
def remap_head(s):
    """Translate a short field name into the label looked up in a VERIFY header.

    The comparison is case-insensitive. Names without a dedicated alias are
    returned lower-cased.
    """
    aliases = {
        'lat': 'latitude:',
        'lon': 'longitude:',
        'alt': 'altitude:',
        'unit': 'measurement unit',
        'tz': 'time zone',
    }
    key = s.lower()
    return aliases.get(key, key)
[docs]
def rescale(obs_file, header):
    """Return the scaling factor associated with the reported scale.

    The scale name is looked up in ``header`` under the ``scale`` key. When
    that lookup fails for any reason, the provider parsed from the file name
    of ``obs_file`` is used as the scale name instead.

    Args:
        obs_file: path of the observation file; only used for the fallback.
        header: header content handed to ``find_header``.

    Returns:
        float: the factor matching the scale, or ``np.nan`` when the scale is
        not recognized, so that the corresponding data are killed.
    """
    try:
        scale = find_header('scale', header)
    except Exception:
        # Best-effort fallback on any lookup failure. ``Exception`` rather
        # than a bare ``except`` so that KeyboardInterrupt and SystemExit
        # still propagate.
        scale = parse_file(obs_file)['provider']

    scale_lower = scale.lower()

    # The order of the tests matters: the first match wins.
    if '04' in scale or 'wmo' in scale_lower:
        return 1.
    if scale == 'CSIRO94':
        return 1.01219
    if 'NIST' in scale or 'USA' in scale:
        return 0.998
    if 'tohoku' in scale_lower:
        return 1.0003
    if 'aircore' in scale_lower:
        return 1.0124
    if scale == 'Manufacture\'s' or 'NIES' in scale:
        return 0.997

    # If the scale is not known, then returns NaN
    return np.nan
[docs]
def parse_file(obs_file):
    """Extract the information encoded in a VERIFY file name.

    The part of the base name before the first dot must consist of exactly
    four underscore-separated fields: parameter, station, provider and a
    last field that is ignored. Any other number of fields raises a
    ``ValueError``.

    Returns:
        dict: with keys ``'parameter'``, ``'stat'`` and ``'provider'``.
    """
    stem = os.path.basename(obs_file).split('.')[0]
    parameter, stat, provider, _ = stem.split('_')
    return {'parameter': parameter, 'stat': stat, 'provider': provider}
[docs]
def convert_unit(df, params, unit='ppb', default_unit='ppb'):
    """Convert columns of ``df`` between ppm, ppb and ppt, in place.

    Each row is first brought to ppm (the internal reference) according to
    its own entry in the ``unit`` column, then scaled to the target ``unit``.
    Rows whose unit is not one of the recognized ppt/ppb spellings are left
    untouched by the first step, i.e. they are treated as already in ppm.

    Args:
        df: data frame to convert; it is modified in place. When it has no
            ``unit`` column, it is returned unchanged.
        params: names of the columns to convert.
        unit: target unit, one of 'ppm', 'ppt', 'ppb' (or the aliases
            'ppbv', 'nmol.mol-1', 'nmol.mol-¹'). Default is 'ppb'.
        default_unit: unit assumed for rows whose unit is an empty string.

    Returns:
        The same data frame, with its ``unit`` column set to ``unit``.

    Raises:
        ValueError: if ``unit`` is not a valid target unit and there is at
            least one column to convert. It is raised before any value is
            modified.
    """
    if 'unit' not in df.columns:
        return df

    ppb_aliases = ('ppb', 'ppbv', 'nmol.mol-1', 'nmol.mol-¹')

    params = list(params)
    if params:
        # Resolve the target factor first, so that an invalid unit cannot
        # leave the data frame half-converted.
        if unit in ppb_aliases:
            from_ppm = 1e3
        elif unit == 'ppt':
            from_ppm = 1e6
        elif unit == 'ppm':
            from_ppm = None
        else:
            raise ValueError(unit + " is not a valid unit for conversion")

        # Change missing unit to default unit
        df.loc[df['unit'] == '', 'unit'] = default_unit

        # Row selections do not depend on the column being converted
        is_ppt = df['unit'] == 'ppt'
        is_ppb = df['unit'].isin(ppb_aliases)

        for p in params:
            # First conversion to ppm as a common reference unit
            df.loc[is_ppt, p] /= 1e6
            df.loc[is_ppb, p] /= 1e3

            # Then conversion to target unit if needed
            if from_ppm is not None:
                df[p] *= from_ppm

    df['unit'] = unit
    return df
[docs]
def shiftdate(dates, tz):
    """Shift local dates to UTC according to a time-zone label.

    Args:
        dates: time-zone naive dates supporting ``tz_localize`` and
            ``tz_convert`` (presumably a pandas ``DatetimeIndex`` -- confirm
            against the callers).
        tz: time-zone label containing a code such as ``'utc+1'``,
            ``'UTC -5'`` or ``'utc+5:30'``; matching is case-insensitive.
            ``None``, an empty string, ``'utc'`` and any zero offset mean
            that no shift is needed.

    Returns:
        The dates expressed in UTC, still time-zone naive. ``dates`` itself
        is returned when no shift is needed.

    Raises:
        ValueError: if no UTC offset can be parsed from ``tz``.
    """
    if tz is None:
        return dates

    tz_code = tz.strip().lower()
    if tz_code in ('', 'utc', 'utc+0', 'utca+0', 'utc +0'):
        return dates

    # Sign, hours and optional ':MM' minutes, tolerating blanks around the
    # sign. The trailing look-ahead rejects over-long digit runs.
    match = re.search(r'utc\s*([+-])\s*(\d{1,2})(?::(\d{2}))?(?!\d)', tz_code)
    if match is None:
        raise ValueError("Cannot parse a UTC offset from time zone %r" % tz)

    sign, hours, minutes = match.groups()
    offset = 3600 * int(hours) + 60 * int(minutes or 0)
    if sign == '-':
        offset = -offset

    if offset == 0:
        return dates

    # Interpret the dates as local time, convert to UTC, then drop the
    # time-zone information again.
    tzlocal = tzoffset('local', offset)
    dates = dates.tz_localize(tzlocal)
    dates = dates.tz_convert('UTC')
    dates = dates.tz_localize(None)
    return dates
[docs]
def average(df, mountain_site=None):
    """Compute daily averages over a station-dependent time window.

    Flask data (any row with network ``'flask'``) are averaged over the whole
    day; stations listed in ``mountain_site`` over night hours (0 to 3);
    all other stations over afternoon hours (12 to 15).

    The averaged observation error is the root mean square of the errors of
    the day divided by the square root of their number.

    Args:
        df: observations indexed by a ``DatetimeIndex``, with at least the
            columns 'obserror', 'unit', 'station', 'network' and
            'parameter'. Each of the last four must hold a single value.
            Note that the columns 'hour' and 'obs_error_sq' are added to
            ``df`` in place, and therefore also appear in the result.
        mountain_site: station names to average at night. A station matches
            either as is or upper-cased. Defaults to no station.

    Returns:
        pandas.DataFrame: one row per day, with the columns of ``df``. Rows
        are stamped at the end of the averaging window (3h or 15h); for
        flask data, at the mean sampling hour rounded up to a multiple
        of 3 hours.

    Raises:
        ValueError: if 'unit', 'station', 'network' or 'parameter' holds
            more than one distinct value.
    """
    if mountain_site is None:
        mountain_site = ()

    # Store some columns which are going to be lost by the averaging
    cols_name = ['unit', 'station', 'network', 'parameter']
    cols = []
    for col in cols_name:
        if df[col].nunique(dropna=False) > 1:
            raise ValueError('There is more than 1 %s in file' % col)
        # Explicitly positional: the index holds dates, not integers
        cols.append(df[col].iloc[0])

    # Select afternoon or night values
    station = cols[1]
    if np.any(df['network'] == 'flask'):
        beg, end, shift = 0, 24, None
        print('\t averaging flask measurement at station', station)
    elif station in mountain_site or station.upper() in mountain_site:
        beg, end, shift = 0, 3, 3
        print('\t averaging station', station, 'at night')
    else:
        beg, end, shift = 12, 15, 15
        print('\t averaging station', station, 'in the afternoon')

    df['hour'] = df.index.hour
    df['obs_error_sq'] = df['obserror'] ** 2
    in_window = (df.index.hour >= beg) & (df.index.hour < end)
    sub_df = df.loc[in_window]

    # Average values and obs error day by day. ``numeric_only`` keeps the
    # text columns out of the mean; they are restored below.
    gb = sub_df.groupby(sub_df.index.date)
    count = gb.count()
    daily_mean = gb.mean(numeric_only=True)
    daily_mean['obserror'] = \
        np.sqrt(daily_mean['obs_error_sq']) / np.sqrt(count['obserror'])

    # Reset to the structure of the input
    final_df = daily_mean.copy()
    for name, col in zip(cols_name, cols):
        final_df[name] = col
    final_df = final_df[df.columns]
    final_df.index = pd.to_datetime(final_df.index)

    if shift is None:
        # One shift per day; plain values avoid any index alignment
        hours = (np.ceil(daily_mean['hour'] / 3.) * 3).values
    else:
        hours = shift
    final_df.index = final_df.index + pd.to_timedelta(hours, unit='h')

    # Set last: arithmetic with an unnamed index may drop the name
    final_df.index.name = df.index.name

    return final_df