1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
# --- Normalize data
# normalize-yamlfa.py [--yaml] --in ~/tmp/pubseq/state.json file(s)
#
# Example:
#
# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
import argparse
import json
import os
import sys
import types
import normalize.mapping as mapping
parser = argparse.ArgumentParser(description="""
Normalize parameters in PubSeq JSON/YAML files. All entries in
directory are parsed using the state.json file. It is possible
to select a subset of IDs.
This tool has two modes of operation. It can validate with the
--validate switch which stops at a warning and does no rewriting.
This mode is typically used in troubleshooting.
The other mode is --rewrite which rewrites the JSON files after
making a backup (.bak) of the original. This mode updates files and
won't stop - it is used for (automated) uploads.
""")
parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
parser.add_argument('--species', type=str, help='Species mapping file')
parser.add_argument('--specimen', type=str, help='Optional specimen mapping file')
parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
parser.add_argument('id', nargs='*', help='optional id(s)')
args = parser.parse_args()
with open(args.state) as jsonf:
data = json.load(jsonf)
dir = os.path.dirname(args.state)
do_validate = args.validate
do_rewrite = args.rewrite
ids = args.id
if not len(ids):
ids = list(data.keys())
species = {}
if args.species:
with open(args.species) as f:
for line in f:
name,uri = line.strip().split(',')
species[name] = uri
else:
print("WARNING: no species mapping",file=sys.stderr)
specimen = {}
if args.specimen:
with open(args.specimen) as f:
for line in f:
name,uri = line.strip().split(',')
specimen[name] = uri
else:
print("WARNING: no specimen mapping",file=sys.stderr)
for id in ids:
if args.yaml:
raise Exception("YAML not yet supported")
fn = f"{dir}/{id}.json"
print(f"Reading {fn}",file=sys.stderr)
with open(fn) as f:
rec = types.SimpleNamespace(**json.load(f))
if do_validate:
print(rec)
rec.host,warning = mapping.host_species(rec.host,species)
if warning:
print("WARNING "+warning,file=sys.stderr)
rec.warnings.append(warning)
rec.sample,warning = mapping.specimen_source(rec.sample,specimen)
if warning:
print("WARNING "+warning,file=sys.stderr)
rec.warnings.append(warning)
print(rec)
if do_validate and warning:
print("bailing out in validation mode",file=sys.stderr)
sys.exit(2)
if do_rewrite:
if not os.path.exists(fn+".bak"): # make backup the first time
os.rename(fn,fn+".bak")
with open(fn, 'w') as outfile:
print(f" Writing {fn}")
json.dump(rec.__dict__, outfile, indent=2)
else:
print(rec)
|