Source code for pycif.plugins.obsparsers.verify.headers
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import absolute_import
from .utils import remap_extract, find_header
import os
import glob
from ....utils import check as check
[docs]
def get_header(obs_file, maxlen=300):
    """Extract the header from a VERIFY format observation file.

    The header is taken to be the first 8 lines of the file; this
    implementation uses a fixed header length and does not look for a
    terminating marker or an explicit line count inside the file.

    Args:
        obs_file (str): path to input file
        maxlen (int): upper bound on the number of lines read from the
            file; never more than this many lines are returned.
            Default 300

    Returns:
        List[str]: header lines with surrounding whitespace stripped.
        Empty if ``obs_file`` is not an existing regular file or if it
        contains no lines.
    """
    # Fixed number of header lines expected at the top of the file
    nheader = 8

    # Return empty if not a file
    if not os.path.isfile(obs_file):
        return []

    # Never read more than the caller allows, nor more than the header needs
    limit = min(maxlen, nheader)

    lines = []
    with open(obs_file, "r") as input_file:
        for line in input_file:
            # Check before appending so that at most `limit` lines are kept
            if len(lines) >= limit:
                break
            lines.append(line.strip())

    return lines
[docs]
def _find_column(head, id_extract):
    """Return the index in ``head`` of the column matching ``id_extract``.

    The column name is obtained from ``remap_extract``; if that name is not
    in ``head``, the same name with an ``_air`` suffix is tried (some files
    have a name with CH4_Air instead of CH4). Returns ``None`` when neither
    candidate can be found or the lookup itself fails.
    """
    for suffix in ('', '_air'):
        try:
            return head.index(remap_extract(id_extract) + suffix)
        except Exception:
            # Not available under this name; try the next candidate
            continue
    return None


def parse_header(header, spec, list_extract,
                 default_unit='ppm',
                 default_tz='utc'):
    """Extract information from the header

    The last line of ``header`` is read as the whitespace-separated list of
    column names; comparisons are done on lower-cased names.

    Args:
        header (list[str]): extracted header
        spec (str): species to extract (not used by this function; kept
            for interface compatibility)
        list_extract (list[str]): list of parameters to return
            'flag' to extract flag
            'error' to extract observation error
            any other parameter appearing in the columns
        default_unit (str): default unit generally used to report this
            species; stored under 'units' when it cannot be extracted
        default_tz (str): default time zone for this file; stored under
            'time' when it cannot be extracted

    Returns:
        a 4-element tuple containing
        - names (list[str]): list of columns names to extract
        - columns (list[int]): list of column index to extract
        - date_ids (list[int]): list of column ids for date information
          (year, month, day, followed by hour, minute, second when an
          'hour' column is present)
        - extra (dict): extra information contained in the header and not
          in the body of the file, e.g., altitude, coordinates, unit, etc.

    Raises:
        ValueError: if the year/month/day columns are missing, or if an
            'hour' column is present without 'minute' and 'second'
        IndexError: if ``header`` is empty
    """
    # Minimize all characters to facilitate comparisons
    head = [s.lower() for s in header[-1].split()]

    # Parsing time information; list.index raises ValueError on a miss
    try:
        date_ids = [head.index(key) for key in ('year', 'month', 'day')]
        if 'hour' in head:
            date_ids += [head.index(key)
                         for key in ('hour', 'minute', 'second')]
    except ValueError:
        print(header)
        print(head)
        raise ValueError("Cant find a date in this VERIFY file. "
                         "Please check format")

    columns = []
    names = []
    extra = {}
    for id_extract in list_extract:
        # First look into columns names
        column = _find_column(head, id_extract)
        if column is not None:
            columns.append(column)
            names.append(id_extract.lower())
            continue

        try:
            # Look into the header
            extra[id_extract.lower()] = find_header(id_extract, header)
        except Exception:
            # If cannot find,
            # assume default values for unit and timezone
            check.verbose("Cant extract " + id_extract)
            if id_extract == 'units':
                extra[id_extract] = default_unit
            elif id_extract == 'time':
                extra[id_extract] = default_tz

    return names, columns, date_ids, extra