author    Pjotr Prins  2021-01-04 08:58:38 +0000
committer Pjotr Prins  2021-01-04 08:58:38 +0000
commit    1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (patch)
tree      34cc42ef12b81c05be8a57ca2a973b97e52f8461
parent    ba4161b1660c3a67090dd3715e9862906fb1cc5f (diff)
Started on normalization
-rw-r--r--  workflows/pull-data/genbank/.gitignore                          1
-rw-r--r--  workflows/pull-data/genbank/README.md                          19
-rw-r--r--  workflows/pull-data/genbank/genbank.py                          4
-rwxr-xr-x  workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py   16
m---------  workflows/tools                                                 0
-rwxr-xr-x  workflows/tools/normalize-yamlfa.py                            97
-rw-r--r--  workflows/tools/normalize/README.md                            14
-rw-r--r--  workflows/tools/normalize/__init__.py                           0
-rw-r--r--  workflows/tools/normalize/mapping.py                           43
-rwxr-xr-x  workflows/tools/sparql-fetch-ids (renamed from workflows/pull-data/genbank/sparql-fetch-ids)   0
10 files changed, 185 insertions, 9 deletions
diff --git a/workflows/pull-data/genbank/.gitignore b/workflows/pull-data/genbank/.gitignore
index 69b8a57..8bfdb5b 100644
--- a/workflows/pull-data/genbank/.gitignore
+++ b/workflows/pull-data/genbank/.gitignore
@@ -1,3 +1,4 @@
fasta_and_yaml/
*.tsv
*.acc
+*.txt
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index b5bac84..d7cc15f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -1,14 +1,25 @@
-# pipeline
+# GenBank
+
+This directory contains the tools to pull and transform
+GenBank data.
+
+# Workflows
+
+## Prepare new GenBank data for upload
+
+The following workflow sends GenBank data into PubSeq
```sh
# --- get list of IDs already in PubSeq
-./sparql-fetch-ids > pubseq_ids.txt
+../../tools/sparql-fetch-ids > pubseq_ids.txt
# --- get list of missing genbank IDs
./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
# --- fetch XML
python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
-# --- Transform to YAML and FASTA
-python3 transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
+# --- Transform to YAML/JSON and FASTA
+python3 transform-genbank-xml2yamlfa.py --out ~/tmp/pubseq file(s)
+# --- Normalize data
+../../tools/normalize-yamlfa.py --state ~/tmp/pubseq/state.json [id(s)]
```
# TODO
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 26cb5e7..85d615c 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -1,4 +1,6 @@
# Genbank XML parser
+#
+# Pjotr Prins (c) 2021
from collections import namedtuple
import dateutil
@@ -59,7 +61,7 @@ Example of an output JSON:
def get_metadata(id, gbseq):
"""This is a minimal data parser from genbank XML records. Inference
on, for example geo location, is not allowed in this function and
- happens downstream.
+ happens downstream (in normalize).
That is to keep the parsing simple.
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index ebdf17e..9414864 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -1,18 +1,17 @@
#!/usr/bin/env python3
#
-# Create a single YAML/FASTA from genbank XML
+# Create a single YAML/FASTA for each genbank entry in GenBank XML file
#
# transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
-#
-# Where --in can be a file or a directory
# ----------------------------------------------------------------------
# See also directory .guix-run and README.md
import argparse
import gzip
+import json
import os
import sys
import types
@@ -47,6 +46,12 @@ for xmlfn in args.files:
         try:
             valid,meta = genbank.get_metadata(id,gb)
             if valid:
+                # --- write JSON
+                jsonfn = basename + ".json"
+                with open(jsonfn, 'w') as outfile:
+                    print(f" writing {jsonfn}")
+                    json.dump(meta, outfile, indent=4)
+                # --- write FASTA
                 fa = basename+".fa"
                 seq = genbank.get_sequence(id,gb)
                 print(f" writing {fa}")
@@ -66,4 +71,7 @@ for xmlfn in args.files:
state['warnings'] = meta['warnings']
states[id] = state
-print(states)
+statefn = dir + '/state.json'
+with open(statefn, 'w') as outfile:
+    print(f" Writing {statefn}")
+    json.dump(states, outfile, indent=4)
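For orientation, a minimal sketch of what the state.json written above could look like. Only the ID keys and the `warnings` list follow from the code shown in this hunk; the `valid` flag and the concrete values are assumptions for illustration.

```python
# Hypothetical sketch of ~/tmp/pubseq/state.json, shown as a Python dict.
# Keys are GenBank IDs; normalize-yamlfa.py iterates over exactly these keys.
states = {
    "MW241349": {
        "valid": True,   # assumed field; not visible in this hunk
        "warnings": [],  # warnings collected by genbank.get_metadata()
    },
}
```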
diff --git a/workflows/tools b/workflows/tools
deleted file mode 160000
-Subproject commit c67c011765bea798a24485cbe0a1c6c59243652
diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
new file mode 100755
index 0000000..e3f92c0
--- /dev/null
+++ b/workflows/tools/normalize-yamlfa.py
@@ -0,0 +1,97 @@
+# --- Normalize data
+# normalize-yamlfa.py [--yaml] --state ~/tmp/pubseq/state.json [id(s)]
+#
+# Example:
+#
+# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json MW241349 --species ./scripts/dict_ontology_standardization/ncbi_host_species.csv
+
+import argparse
+import json
+import os
+import sys
+import types
+import normalize.mapping as mapping
+
+parser = argparse.ArgumentParser(description="""
+
+Normalize parameters in PubSeq JSON/YAML files. All entries in the
+directory are parsed using the state.json file. It is possible
+to select a subset of IDs.
+
+This tool has two modes of operation. It can validate with the
+`--validate` switch, which stops at the first warning and does no
+rewriting. This mode is typically used for troubleshooting.
+
+The other mode is `--rewrite`, which rewrites the JSON files after
+making a backup (.bak) of the original. This mode updates files and
+won't stop - it is used for (automated) uploads.
+
+""")
+
+parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
+parser.add_argument('--species', type=str, help='Species mapping file')
+parser.add_argument('--specimen', type=str, help='Specimen mapping file')
+parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
+parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
+parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
+parser.add_argument('id', nargs='*', help='optional id(s)')
+
+args = parser.parse_args()
+
+with open(args.state) as jsonf:
+    data = json.load(jsonf)
+
+dir = os.path.dirname(args.state)
+do_validate = args.validate
+do_rewrite = args.rewrite
+
+ids = args.id
+if not len(ids):
+    ids = list(data.keys())
+
+species = {}
+if args.species:
+    with open(args.species) as f:
+        for line in f:
+            name,uri = line.strip().split(',')
+            species[name] = uri
+else:
+    print("WARNING: no species mapping",file=sys.stderr)
+specimen = {}
+if args.specimen:
+    with open(args.specimen) as f:
+        for line in f:
+            name,uri = line.strip().split(',')
+            specimen[name] = uri
+else:
+    print("WARNING: no specimen mapping",file=sys.stderr)
+
+for id in ids:
+    if args.yaml:
+        raise Exception("YAML not yet supported")
+    fn = f"{dir}/{id}.json"
+    print(f"Reading {fn}",file=sys.stderr)
+    with open(fn) as f:
+        rec = types.SimpleNamespace(**json.load(f))
+    if do_validate:
+        print(rec)
+    rec.host,warning = mapping.host_species(rec.host,species)
+    if warning:
+        print("WARNING "+warning,file=sys.stderr)
+        rec.warnings.append(warning)
+    rec.sample,warning = mapping.specimen_source(rec.sample,specimen)
+    if warning:
+        print("WARNING "+warning,file=sys.stderr)
+        rec.warnings.append(warning)
+    print(rec)
+    if do_validate and warning:
+        print("bailing out in validation mode",file=sys.stderr)
+        sys.exit(2)
+    if do_rewrite:
+        if not os.path.exists(fn+".bak"): # make backup the first time
+            os.rename(fn,fn+".bak")
+        with open(fn, 'w') as outfile:
+            print(f" Writing {fn}")
+            json.dump(rec.__dict__, outfile, indent=4)
+    else:
+        print(rec)
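The `--species` and `--specimen` mapping files are plain two-column CSVs (`name,uri`, no header), as implied by the `split(',')` parsing above. A minimal sketch with one example row, mirroring the parsing loop in the script:

```python
# Minimal sketch of the mapping-file format read by normalize-yamlfa.py;
# the row is an example, the name,uri layout follows from the split(',').
example_csv = "Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606\n"

species = {}
for line in example_csv.splitlines():
    name, uri = line.strip().split(',')
    species[name] = uri

print(species["Homo sapiens"])  # -> the OBO/NCBITaxon URI for this name
```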
diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md
new file mode 100644
index 0000000..b780a68
--- /dev/null
+++ b/workflows/tools/normalize/README.md
@@ -0,0 +1,14 @@
+# Normalization steps
+
+This library contains generic logic to normalize (string) data and
+transforms strings to URIs. It should be applicable to data from
+any source (GenBank, ENA etc).
+
+Important: missing data should be missing or None! Do not fill
+in data by 'guessing'.
+
+When data is malformed a warning should be logged and added to the
+warning list. Functions should be small enough to return only 1
+warning!
+
+Pjotr Prins (c) 2021
diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/workflows/tools/normalize/__init__.py
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
new file mode 100644
index 0000000..1d52b03
--- /dev/null
+++ b/workflows/tools/normalize/mapping.py
@@ -0,0 +1,43 @@
+# Normalization steps
+#
+# This library contains generic logic to normalize (string) data and
+# transforms strings to URIs. It should be applicable to data from
+# any source (GenBank, ENA etc).
+#
+# Important: missing data should be missing or None! Do not fill
+# in data by 'guessing'.
+#
+# When data is malformed a warning should be logged and added to the
+# warning list. Functions should be small enough to return only 1
+# warning!
+#
+# Pjotr Prins (c) 2021
+
+import types
+
+def host_species(host,mapping):
+    warning = None
+    host = types.SimpleNamespace(**host)
+    if not 'obolibrary' in host.host_species:
+        key = host.host_species
+        if key in mapping:
+            host.host_species = mapping[key]
+        else:
+            warning = f"No URI mapping for host_species <{key}>"
+    return host.__dict__,warning
+
+def specimen_source(sample,mapping):
+    warning = None
+    sample = types.SimpleNamespace(**sample)
+    try:
+        if sample.specimen_source and not 'obolibrary' in sample.specimen_source:
+            key = sample.specimen_source
+            if key in mapping:
+                sample.specimen_source = mapping[key]
+            else:
+                sample.specimen_source = None
+                warning = f"No URI mapping for specimen_source <{key}>"
+    except AttributeError:
+        pass
+    if not sample.specimen_source: del(sample.specimen_source)
+    return sample.__dict__,warning
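A small usage sketch of the two helpers, assuming record fragments shaped like the `rec.host`/`rec.sample` dicts that normalize-yamlfa.py passes in (the concrete values are invented):

```python
# Hypothetical usage of normalize.mapping; dict shapes mirror rec.host/rec.sample
# in normalize-yamlfa.py, the values are only examples.
import normalize.mapping as mapping

species = {"Homo sapiens": "http://purl.obolibrary.org/obo/NCBITaxon_9606"}

host, warning = mapping.host_species({"host_species": "Homo sapiens"}, species)
print(host["host_species"])  # replaced by the OBO URI; warning is None

host, warning = mapping.host_species({"host_species": "Pangolin"}, species)
print(warning)  # "No URI mapping for host_species <Pangolin>" - value left as-is
```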
diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids
index 19b2d82..19b2d82 100755
--- a/workflows/pull-data/genbank/sparql-fetch-ids
+++ b/workflows/tools/sparql-fetch-ids