diff options
author | Pjotr Prins | 2021-01-04 08:58:38 +0000 |
---|---|---|
committer | Pjotr Prins | 2021-01-04 08:58:38 +0000 |
commit | 1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (patch) | |
tree | 34cc42ef12b81c05be8a57ca2a973b97e52f8461 /workflows/tools/normalize-yamlfa.py | |
parent | ba4161b1660c3a67090dd3715e9862906fb1cc5f (diff) | |
download | bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.gz bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.lz bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.zip |
Started on normalization
Diffstat (limited to 'workflows/tools/normalize-yamlfa.py')
-rwxr-xr-x | workflows/tools/normalize-yamlfa.py | 97 |
1 files changed, 97 insertions, 0 deletions
# --- Normalize data
#
#   normalize-yamlfa.py [--yaml] -s ~/tmp/pubseq/state.json [id(s)]
#
# Example:
#
#   python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json MW241349 --species ./scripts/dict_ontology_standardization/ncbi_host_species.csv

import argparse
import json
import os
import sys
import types

import normalize.mapping as mapping

DESCRIPTION = """

Normalize parameters in PubSeq JSON/YAML files. All entries in
directory are parsed using the state.json file. It is possible
to select a subset of IDs.

This tool has two modes of operation. It can validate with the
`--validate` switch which stops at a warning and does no rewriting.
This mode is typically used in troubleshooting.

The other mode is `--rewrite` which rewrites the JSON files after
making a backup (.bak) of the original. This mode updates files and
won't stop - it is used for (automated) uploads.

"""


def _build_parser():
    """Return the command line parser for this tool."""
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('-s', '--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
    parser.add_argument('--species', type=str, help='Species mapping file')
    parser.add_argument('--specimen', type=str, help='Specimen mapping file')
    parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
    parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
    parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
    parser.add_argument('id', nargs='*', help='optional id(s)')
    return parser


def _load_mapping(path, label):
    """Read a two-column ``name,uri`` file into a dict.

    path  -- mapping file name; when empty/None a warning is printed to
             stderr and an empty mapping is returned
    label -- short name ("species", "specimen") used in that warning

    Blank lines are skipped. Any other line that does not consist of
    exactly two comma separated fields raises ValueError, naming the
    file and line number.
    """
    if not path:
        print(f"WARNING: no {label} mapping", file=sys.stderr)
        return {}
    table = {}
    with open(path, encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate empty/trailing lines
            fields = line.split(',')
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{lineno}: expected 'name,uri' but got {line!r}")
            name, uri = fields
            table[name] = uri
    return table


def _normalize(rec, species, specimen):
    """Normalize the host and sample fields of rec in place.

    Each mapping function is used as returning a (new value, warning)
    pair. Every non-empty warning is echoed to stderr, appended to
    rec.warnings and collected in the returned list, so the caller sees
    warnings from *all* fields and not only from the last one.
    """
    found = []
    rec.host, warning = mapping.host_species(rec.host, species)
    if warning:
        found.append(warning)
        print("WARNING "+warning, file=sys.stderr)
        rec.warnings.append(warning)
    rec.sample, warning = mapping.specimen_source(rec.sample, specimen)
    if warning:
        found.append(warning)
        print("WARNING "+warning, file=sys.stderr)
        rec.warnings.append(warning)
    return found


def main():
    """Validate or rewrite the records listed in the state file.

    Exits with status 2 when running with --validate and a record
    produced any warning.
    """
    args = _build_parser().parse_args()

    with open(args.state, encoding="utf-8") as jsonf:
        data = json.load(jsonf)

    # Records live next to the state file. dirname is "" for a bare file
    # name; os.path.join then yields a relative path instead of "/<id>.json".
    basedir = os.path.dirname(args.state)

    # Without explicit ids on the command line process every id in the state
    ids = args.id or list(data)

    species = _load_mapping(args.species, "species")
    specimen = _load_mapping(args.specimen, "specimen")

    for rec_id in ids:
        if args.yaml:
            raise Exception("YAML not yet supported")
        fn = os.path.join(basedir, f"{rec_id}.json")
        print(f"Reading {fn}", file=sys.stderr)
        # Close the input before the file is renamed/rewritten below
        with open(fn, encoding="utf-8") as f:
            rec = types.SimpleNamespace(**json.load(f))
        if args.validate:
            print(rec)
        warnings = _normalize(rec, species, specimen)
        print(rec)
        if args.validate and warnings:
            print("bailing out in validation mode", file=sys.stderr)
            sys.exit(2)
        if args.rewrite:
            backup = fn+".bak"
            if not os.path.exists(backup):  # make backup the first time
                os.rename(fn, backup)
            with open(fn, 'w', encoding="utf-8") as outfile:
                print(f" Writing {fn}")
                json.dump(rec.__dict__, outfile, indent=4)
        else:
            print(rec)


if __name__ == "__main__":
    main()