"""
:Author: Daniel Mohr
:Email: daniel.mohr@dlr.de
:Date: 2021-02-08 (last change).
:License: GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007.
"""
import datetime
import re
import time


def date_rdate2isoformat(data):
"""
:Author: Daniel Mohr
:Email: daniel.mohr@dlr.de
    :Date: 2021-02-08 (last change).

    Convert a whitespace separated date string like '2021 02 08'
    into an ISO 8601 date string ('2021-02-08').
    """
    parts = data.split()
    return datetime.date(int(parts[0]),
                         int(parts[1]),
                         int(parts[2])).isoformat()
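

# Example for the helper above (illustration, not part of the original
# source): a whitespace separated header date becomes an ISO date string:
#
#     >>> date_rdate2isoformat('2021  2  8')
#     '2021-02-08'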


def check_nasa_ames_format(filename, output_format='human_readable'):
"""
:Author: Daniel Mohr
:Email: daniel.mohr@dlr.de
:Date: 2021-02-08 (last change).

    Check the given file for the NASA Ames format, see:

    * http://cedadocs.ceda.ac.uk/73/
    * http://cedadocs.ceda.ac.uk/73/4/index.html
    * http://cedadocs.ceda.ac.uk/73/4/FFI-summary.html

    :param filename: file to analyse
    :param output_format: if set to anything other than 'human_readable',
                          the extracted metadata fields (e.g. NLHEAD, FFI,
                          ONAME) are added to the returned dictionary
    :return: dictionary with the check result (error and warning counters
             and a log of messages)
    """
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
result = dict()
checker_name = 'pydabu (nasa ames format check)'
addresult = dict()
result[checker_name] = dict()
result[checker_name]['error'] = 0
result[checker_name]['warning'] = 0
result[checker_name]['log'] = []
metadata_part = []
with open(filename, mode='r') as fd:
        for _ in range(7):
            metadata_part.append(fd.readline())
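    # For orientation (illustrative sketch, not from the original source;
    # the values are made up): the first seven header lines of a NASA Ames
    # file look roughly like
    #
    #   48 1001                    <- NLHEAD FFI
    #   Mohr, Daniel               <- ONAME
    #   DLR                        <- ORG
    #   example instrument         <- SNAME
    #   example campaign           <- MNAME
    #   1 1                        <- IVOL NVOL
    #   2021 02 08 2021 02 08      <- DATE RDATE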
if len(metadata_part) == 7:
        # NLHEAD: Number of lines in file header
        # FFI: File format index
        nlhead_ffi = metadata_part[0].strip().split()
        try:
            addresult['NLHEAD'], addresult['FFI'] = map(int, nlhead_ffi)
        except ValueError:
            # wrong number of fields or non-integer values
            result[checker_name]['log'] += [
                'error: '
                'no nasa ames format detected (cannot analyse first line)']
            result[checker_name]['error'] += 1
if result[checker_name]['error'] == 0:
        if metadata_part[5]:  # len(metadata_part[5]) > 0
            # IVOL: Number of the file in the above dataset
            #       (between 1 and NVOL).
            # NVOL: Total number of files belonging to the considered
            #       dataset (i.e. with same ONAME, ORG, SNAME, MNAME).
            ivol_nvol = metadata_part[5].strip().split()
            if len(ivol_nvol) == 2:
                try:
                    ivol, nvol = map(int, ivol_nvol)
                except ValueError:
                    ivol, nvol = 0, 0  # force the error branch below
                if 1 <= ivol <= nvol:
                    addresult['IVOL'] = ivol
                    addresult['NVOL'] = nvol
                else:
                    result[checker_name]['log'] += [
                        'error: do not understand IVOL and NVOL']
                    result[checker_name]['error'] += 1
            else:
                result[checker_name]['log'] += [
                    'error: cannot extract IVOL and NVOL']
                result[checker_name]['error'] += 1
else:
result[checker_name]['log'] += [
'error: IVOL and NVOL not found']
result[checker_name]['error'] += 1
if result[checker_name]['error'] == 0:
        if metadata_part[1]:  # len(metadata_part[1]) > 0
if len(metadata_part[1]) < 132 + 1:
# ONAME: List of author(s) in the format Lastname,
# Firstname; separated by an arbitrary character
# (for example, a hyphen or a semi-colon).
                # since it is hard to automatically split at an arbitrary
                # character, we only check for a comma
if ',' in metadata_part[1]:
addresult['ONAME'] = metadata_part[1].strip()
else:
result[checker_name]['log'] += [
'warning: do not understand ONAME format']
result[checker_name]['warning'] += 1
else:
result[checker_name]['log'] += [
'warning: ONAME too long']
result[checker_name]['warning'] += 1
else:
result[checker_name]['log'] += [
'warning: ONAME is empty']
result[checker_name]['warning'] += 1
for (pos, tag) in [(2, 'ORG'), (3, 'SNAME'), (4, 'MNAME')]:
# ORG: Organisation name (university, institute, etc).
# May include address and phone numbers.
# SNAME: Source of data, i.e. instrument, platform, model name,
# etc.
# MNAME: Name of mission, campaign, programme and/or project.
        if metadata_part[pos]:  # len(metadata_part[pos]) > 0
if len(metadata_part[pos]) < 132 + 1:
addresult[tag] = metadata_part[pos].strip()
else:
result[checker_name]['log'] += [
'warning: ' + tag + ' too long']
result[checker_name]['warning'] += 1
    if metadata_part[6]:  # len(metadata_part[6]) > 0
date_rdate = re.findall(
r'([0-9]{4}[ ]{1,2}[0-9]{1,2}[ ]{1,2}[0-9]{1,2})',
metadata_part[6].strip())
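        # Illustration (not from the original source): a header line such as
        # '2021 02 08 2021 02 09' yields the two matches
        # ['2021 02 08', '2021 02 09'], i.e. the DATE and RDATE entries.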
if len(date_rdate) > 2:
result[checker_name]['log'] += [
'warning: too many "dates" in DATE RDATE']
result[checker_name]['warning'] += 1
elif len(date_rdate) == 1:
addresult['DATE'] = date_rdate2isoformat(date_rdate[0])
elif len(date_rdate) == 2:
date_rdate = re.findall(
r'([0-9]{4}[ ]{1,2}[0-9]{1,2}[ ]{1,2}[0-9]{1,2})'
r'\s*'
r'([0-9]{4}[ ]{1,2}[0-9]{1,2}[ ]{1,2}[0-9]{1,2})',
metadata_part[6].strip())
if date_rdate:
addresult['DATE'] = date_rdate2isoformat(
date_rdate[0][0])
addresult['RDATE'] = date_rdate2isoformat(
date_rdate[0][1])
else:
result[checker_name]['log'] += [
'warning: do not understand DATE RDATE']
result[checker_name]['warning'] += 1
else:
result[checker_name]['log'] += [
'warning: do not understand DATE RDATE']
result[checker_name]['warning'] += 1
result[checker_name]['created'] = time.time()
if output_format != 'human_readable':
for key in addresult:
result[checker_name][key] = addresult[key]
return result
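

# A minimal usage sketch (not part of the original module; 'example.na' is a
# hypothetical NASA Ames file). Any output_format other than 'human_readable'
# also adds the extracted header fields (NLHEAD, FFI, ONAME, ...) to the
# returned dictionary:
#
#     import json
#     report = check_nasa_ames_format('example.na', output_format='json')
#     print(json.dumps(report, indent=2))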