# workflows/tools/normalize-yamlfa.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

# --- Normalize data
# normalize-yamlfa.py [--yaml] [--validate] [--rewrite] [--species FILE] [--specimen FILE] -s ~/tmp/pubseq/state.json [id(s)]
#
# Example:
#
#    python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate

import argparse
import json
import os
import sys
import types
import normalize.mapping as mapping

# Help text shown by `--help`; kept verbatim, including the surrounding
# blank lines.
_DESCRIPTION = """

Normalize parameters in PubSeq JSON/YAML files. All entries in
directory are parsed using the state.json file. It is possible
to select a subset of IDs.

This tool has two modes of operation. It can validate with the
`--validate` switch which stops at a warning and does no rewriting.
This mode is typically used in troubleshooting.

The other mode is `--rewrite` which rewrites the JSON files after
making a backup (.bak) of the original. This mode updates files and
won't stop - it is used for (automated) uploads.

"""

parser = argparse.ArgumentParser(description=_DESCRIPTION)

# The state file is the only mandatory option.
parser.add_argument(
    '-s', '--state',
    type=str,
    required=True,
    help='State file (JSON) as produced by transform2yamlfa',
)

# Optional mapping files (string valued, default None).
for option, text in (
    ('--species', 'Species mapping file'),
    ('--specimen', 'Optional specimen mapping file'),
):
    parser.add_argument(option, type=str, help=text)

# Boolean switches (default False).
for switch, text in (
    ('--validate', 'Validation mode - stops on warning'),
    ('--rewrite', 'Rewrite mode - updates files'),
    ('--yaml', 'Input YAML instead of JSON'),
):
    parser.add_argument(switch, action='store_true', help=text)

# Zero or more record ids; an empty list means "no explicit selection".
parser.add_argument('id', nargs='*', help='optional id(s)')

# Parse the process command line; argparse exits with a usage message when
# the required -s/--state option is missing.
args = parser.parse_args()

# The state file holds a JSON object; its keys are the record ids.
with open(args.state) as jsonf:
    data = json.load(jsonf)

# The per-record JSON files are looked up in the directory of the state file.
# os.path.dirname() returns "" for a bare filename such as "state.json";
# fall back to "." so record paths built from `dir` stay relative to the
# current directory instead of pointing at the filesystem root.
dir = os.path.dirname(args.state) or "."
do_validate = args.validate
do_rewrite = args.rewrite

# Ids given on the command line select a subset; with none given, every key
# of the state file is processed.
ids = args.id or list(data.keys())

def _load_mapping(path):
    """Read a two-column ``name,uri`` text file into a dict.

    Each non-blank line is stripped of surrounding whitespace and split on
    commas; the first field becomes the key and the second the value.  Blank
    lines (for example a trailing empty line at the end of the file) are
    skipped.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            comma-separated fields.  The message names the file and line.
    """
    table = {}
    with open(path) as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{lineno}: expected 'name,uri' but found "
                    f"{len(fields)} field(s)")
            name, uri = fields
            table[name] = uri
    return table


# Both mappings are optional; without a file the table stays empty and a
# warning is written to stderr.
species = {}
if args.species:
    species = _load_mapping(args.species)
else:
    print("WARNING: no species mapping",file=sys.stderr)
specimen = {}
if args.specimen:
    specimen = _load_mapping(args.specimen)
else:
    print("WARNING: no specimen mapping",file=sys.stderr)

# Normalize every selected record.  For each id the matching JSON file is
# read, the `host` and `sample` fields are passed through the mapping helpers
# and the result is printed; with --rewrite the file is written back after a
# one-time backup, with --validate the first record that produced a warning
# ends the run with exit status 2.
for rec_id in ids:
    if args.yaml:
        raise Exception("YAML not yet supported")
    # os.path.join copes with an empty directory part.
    fn = os.path.join(dir, f"{rec_id}.json")
    print(f"Reading {fn}",file=sys.stderr)
    # Close the input before the file is possibly renamed and rewritten.
    with open(fn) as f:
        rec = types.SimpleNamespace(**json.load(f))
    if do_validate:
        print(rec)  # record as read, before normalization

    # Track whether ANY mapping step warned; the helpers each return a
    # (value, warning) pair and the second call would otherwise hide a
    # warning reported by the first.
    warned = False
    rec.host,warning = mapping.host_species(rec.host,species)
    if warning:
        warned = True
        print("WARNING "+warning,file=sys.stderr)
        rec.warnings.append(warning)
    rec.sample,warning = mapping.specimen_source(rec.sample,specimen)
    if warning:
        warned = True
        print("WARNING "+warning,file=sys.stderr)
        rec.warnings.append(warning)
    print(rec)
    if do_validate and warned:
        print("bailing out in validation mode",file=sys.stderr)
        sys.exit(2)
    if do_rewrite:
        backup = fn+".bak"
        if not os.path.exists(backup): # make backup the first time
            os.rename(fn,backup)
        with open(fn, 'w') as outfile:
            print(f"    Writing {fn}")
            json.dump(rec.__dict__, outfile, indent=4)
    else:
        # Output kept as in the original: the record is printed a second
        # time when nothing is rewritten.
        print(rec)