# Source code for pycif.plugins.obsparsers.verify.headers

# -*- coding: utf-8 -*-

from __future__ import print_function
from __future__ import absolute_import
from .utils import remap_extract, find_header
import os
import glob

from ....utils import check as check
from ....utils.check.errclass import CifValueError



# [docs]
def get_header(obs_file, maxlen=300):
    """Extract the header from a VERIFY format observation file

    The header is taken to be the first 8 lines of the file. Each line is
    returned with leading and trailing whitespace removed.

    Args:
        obs_file (str): path to input file
        maxlen (int): abort after this amount of lines when reading header.
                      Default 300

    Returns:
        List[str]: List with all Lines of the Header, i.e. at most
            ``min(8, maxlen)`` lines. Empty if ``obs_file`` is not an
            existing file or if the file contains no line.

    """
    # Imported locally so that the function does not rely on a module-level
    # import that the rest of the file does not need
    from itertools import islice

    # NOTE: the header length is fixed here. Formats terminating the header
    # with 'CXX', or declaring the number of header lines inside the file,
    # are not detected by this function.
    nheader = 8

    # return empty if not a file
    if not os.path.isfile(obs_file):
        return []

    # Never read more lines than requested by the caller, nor more than the
    # header length; a negative limit is treated as "read nothing"
    nlines = max(0, min(nheader, maxlen))

    with open(obs_file, "r") as input_file:
        # islice stops consuming the file as soon as nlines lines were read,
        # so the body of a large observation file is never loaded
        return [line.strip() for line in islice(input_file, nlines)]




# [docs]
def parse_header(header, spec, list_extract,
                 default_unit='ppm',
                 default_tz='utc'):
    """Extract information from the header

    The last line of ``header`` is read as the whitespace-separated list of
    column names (compared case-insensitively). Each requested parameter is
    looked up first among these column names, then in the header lines, and
    finally falls back to a default value for 'units' and 'time' only.

    Args:
        header (list[str]): extracted header
        spec (str): species to extract (not used by this function)
        list_extract (list[str]): list of parameters to return
                                  'flag' to extract flag
                                  'error' to extract observation error
                                  any other parameter appearing in the columns
        default_unit (str): default unit generally used to report this species
        default_tz (str): default time zone for this file

    Returns:
        a 4-element tuple containing
            - names (list[str]): list of columns names to extract
            - columns (list[int]): list of column index to extract
            - date_ids (list[int]): list of column ids for date information
                (year, month, day, and hour, minute, second when an 'hour'
                column exists)
            - extra (dict): extra information contained in the header and not
                in the body of the file, e.g., altitude, coordinates, unit, etc.

    Raises:
        CifValueError: if the 'year', 'month' or 'day' column is missing
        ValueError: if an 'hour' column exists without 'minute' and 'second'
        IndexError: if ``header`` is empty

    """
    # Minimize all characters to facilitate comparisons
    head = [s.lower() for s in header[-1].split()]

    # Parsing time information
    try:
        date_ids = [head.index('year'), head.index('month'), head.index('day')]

    except ValueError:
        # list.index only raises ValueError; dump what was read to help
        # diagnosing the malformed file before failing
        print(header)
        print(head)
        raise CifValueError("Cant find a date in this VERIFY file. "
                            "Please check format")

    if 'hour' in head:
        date_ids += [head.index('hour'),
                     head.index('minute'),
                     head.index('second')]

    # Getting other parameters using the utils.remap_extract function
    columns = []
    names = []

    extra = {}

    for id_extract in list_extract:
        # First look into columns names.
        # Some files have a name with CH4_Air instead of CH4, hence the
        # second attempt with the '_air' suffix.
        column = None
        for suffix in ('', '_air'):
            try:
                column = head.index(remap_extract(id_extract) + suffix)

            except Exception:
                # Any failure (unknown id, column not in the file) only means
                # "not found under this name"; unlike a bare except, this
                # lets KeyboardInterrupt and SystemExit through
                continue

            break

        if column is not None:
            columns.append(column)
            names.append(id_extract.lower())
            continue

        try:
            # Look into the header
            extra[id_extract.lower()] = find_header(id_extract, header)

        except Exception:
            # If cannot find,
            # assume default values for unit and timezone
            check.verbose("Cant extract " + id_extract)

            if id_extract == 'units':
                extra[id_extract] = default_unit

            elif id_extract == 'time':
                extra[id_extract] = default_tz

    return names, columns, date_ids, extra