aboutsummaryrefslogtreecommitdiff
path: root/workflows/tools
diff options
context:
space:
mode:
authorPjotr Prins2021-01-04 08:58:38 +0000
committerPjotr Prins2021-01-04 08:58:38 +0000
commit1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (patch)
tree34cc42ef12b81c05be8a57ca2a973b97e52f8461 /workflows/tools
parentba4161b1660c3a67090dd3715e9862906fb1cc5f (diff)
downloadbh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.gz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.lz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.zip
Started on normalization
Diffstat (limited to 'workflows/tools')
m---------workflows/tools0
-rwxr-xr-xworkflows/tools/normalize-yamlfa.py97
-rw-r--r--workflows/tools/normalize/README.md14
-rw-r--r--workflows/tools/normalize/__init__.py0
-rw-r--r--workflows/tools/normalize/mapping.py43
-rwxr-xr-xworkflows/tools/sparql-fetch-ids67
6 files changed, 221 insertions, 0 deletions
diff --git a/workflows/tools b/workflows/tools
deleted file mode 160000
-Subproject c67c011765bea798a24485cbe0a1c6c59243652
diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
new file mode 100755
index 0000000..e3f92c0
--- /dev/null
+++ b/workflows/tools/normalize-yamlfa.py
@@ -0,0 +1,97 @@
# --- Normalize data
# normalize-yamlfa.py --state ~/tmp/pubseq/state.json [--yaml] [--species csv] [--specimen csv] [id(s)]
# (fixed: the old usage line referred to a `--in` flag that was never
# implemented; the actual required option is `-s`/`--state`)
#
# Example:
#
# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json MW241349 --species ./scripts/dict_ontology_standardization/ncbi_host_species.csv

import argparse
import json
import os
import sys
import types
import normalize.mapping as mapping

parser = argparse.ArgumentParser(description="""

Normalize parameters in PubSeq JSON/YAML files. All entries in
directory are parsed using the state.json file. It is possible
to select a subset of IDs.

This tool has two modes of operation. It can validate with the
`--validate` switch which stops at a warning and does no rewriting.
This mode is typically used in troubleshooting.

The other mode is `--rewrite` which rewrites the JSON files after
making a backup (.bak) of the original. This mode updates files and
won't stop - it is used for (automated) uploads.

""")

parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
parser.add_argument('--species', type=str, help='Species mapping file')
parser.add_argument('--specimen', type=str, help='Specimen mapping file')
parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
parser.add_argument('id', nargs='*', help='optional id(s)')

args = parser.parse_args()


def _load_mapping(fn, what):
    """Load a two-column CSV ("name,uri") into a name -> URI dict.

    Prints a warning to stderr when no file was given (best-effort:
    normalization then simply finds no mappings).  Assumes exactly one
    comma per line -- TODO confirm the ontology CSVs never contain
    embedded commas.
    """
    table = {}
    if fn:
        with open(fn) as f:
            for line in f:
                name, uri = line.strip().split(',')
                table[name] = uri
    else:
        print(f"WARNING: no {what} mapping", file=sys.stderr)
    return table


# The state file maps each record id to its metadata; the per-id JSON
# files are expected to live in the same directory as the state file.
with open(args.state) as jsonf:
    data = json.load(jsonf)

basedir = os.path.dirname(args.state)  # was `dir`, which shadowed the builtin
do_validate = args.validate
do_rewrite = args.rewrite

# No ids on the command line means "process every id in the state file".
ids = args.id
if not len(ids):
    ids = list(data.keys())

species = _load_mapping(args.species, "species")
specimen = _load_mapping(args.specimen, "specimen")

for rec_id in ids:
    if args.yaml:
        raise Exception("YAML not yet supported")
    fn = f"{basedir}/{rec_id}.json"
    print(f"Reading {fn}", file=sys.stderr)
    with open(fn) as f:
        # SimpleNamespace gives attribute-style access to the record's
        # top-level JSON keys (host, sample, warnings, ...).
        rec = types.SimpleNamespace(**json.load(f))
        if do_validate:
            print(rec)
        # Normalize host species and specimen source to ontology URIs.
        # Collect *all* warnings: previously only the last mapping call's
        # warning was consulted for the bail-out below, so a host_species
        # warning alone never stopped validation (bug fix).
        warnings = []
        rec.host, warning = mapping.host_species(rec.host, species)
        if warning:
            print("WARNING "+warning, file=sys.stderr)
            warnings.append(warning)
        rec.sample, warning = mapping.specimen_source(rec.sample, specimen)
        if warning:
            print("WARNING "+warning, file=sys.stderr)
            warnings.append(warning)
        # NOTE(review): assumes every record carries a `warnings` list --
        # confirm upstream transform2yamlfa always writes one.
        rec.warnings.extend(warnings)
        print(rec)
        if do_validate and warnings:
            print("bailing out in validation mode", file=sys.stderr)
            sys.exit(2)
        if do_rewrite:
            # Back up the original only on the first rewrite; later runs
            # overwrite the file in place.
            if not os.path.exists(fn+".bak"): # make backup the first time
                os.rename(fn, fn+".bak")
            with open(fn, 'w') as outfile:
                print(f" Writing {fn}")
                json.dump(rec.__dict__, outfile, indent=4)
        else:
            print(rec)
diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md
new file mode 100644
index 0000000..b780a68
--- /dev/null
+++ b/workflows/tools/normalize/README.md
@@ -0,0 +1,14 @@
+# Normalization steps
+
+This library contains generic logic to normalize (string) data and
+transform strings to URIs. It should be applicable to data from
+any source (GenBank, ENA, etc.).
+
+Important: missing data should be missing or None! Do not fill
+in data by 'guessing'.
+
+When data is malformed, a warning should be logged and added to the
+warnings list. Functions should be small enough to return at most one
+warning!
+
+Pjotr Prins (c) 2021
diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/workflows/tools/normalize/__init__.py
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
new file mode 100644
index 0000000..1d52b03
--- /dev/null
+++ b/workflows/tools/normalize/mapping.py
@@ -0,0 +1,43 @@
+# Normalization steps
+#
+# This library contains generic logic to normalize (string) data and
+# transform strings to URIs. It should be applicable to data from
+# any source (GenBank, ENA, etc.).
+#
+# Important: missing data should be missing or None! Do not fill
+# in data by 'guessing'.
+#
+# When data is malformed, a warning should be logged and added to the
+# warnings list. Functions should be small enough to return at most one
+# warning!
+#
+# Pjotr Prins (c) 2021
+
+import types
+
def host_species(host, mapping):
    """Normalize host['host_species'] to an ontology URI.

    host    -- dict of host attributes; keys other than host_species
               pass through untouched
    mapping -- dict of species name -> ontology URI

    Returns (host_dict, warning) where warning is None or a single
    human-readable string.  A value already containing 'obolibrary' is
    assumed to be a URI and left alone.  A missing host_species entry is
    left missing (no guessing) -- the original raised AttributeError in
    that case, inconsistently with specimen_source which guards it.
    """
    warning = None
    ns = types.SimpleNamespace(**host)
    name = getattr(ns, 'host_species', None)
    if name is not None and 'obolibrary' not in name:
        if name in mapping:
            ns.host_species = mapping[name]
        else:
            warning = f"No URI mapping for host_species <{name}>"
    return ns.__dict__, warning
+
def specimen_source(sample, mapping):
    """Normalize sample['specimen_source'] to an ontology URI.

    sample  -- dict of sample attributes; other keys pass through
    mapping -- dict of specimen name -> ontology URI

    Returns (sample_dict, warning).  An unmappable value is dropped with
    a warning; a falsy or missing specimen_source is removed from the
    result rather than guessed at (missing data stays missing).

    Bug fix: the original guarded the first attribute access with
    try/except AttributeError, but then re-read `sample.specimen_source`
    OUTSIDE the try when deciding whether to delete it -- so a record
    without a specimen_source key crashed with an uncaught
    AttributeError.  getattr with a default avoids both raises.
    """
    warning = None
    ns = types.SimpleNamespace(**sample)
    source = getattr(ns, 'specimen_source', None)
    if source and 'obolibrary' not in source:
        if source in mapping:
            ns.specimen_source = mapping[source]
        else:
            ns.specimen_source = None
            warning = f"No URI mapping for specimen_source <{source}>"
    # Drop a falsy specimen_source instead of serializing None.
    if not getattr(ns, 'specimen_source', None) and hasattr(ns, 'specimen_source'):
        del ns.specimen_source
    return ns.__dict__, warning
diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids
new file mode 100755
index 0000000..19b2d82
--- /dev/null
+++ b/workflows/tools/sparql-fetch-ids
@@ -0,0 +1,67 @@
#!/usr/bin/env ruby
#
# Use a SPARQL query to fetch all IDs in the PubSeq database
#
# sparql-fetch-ids > pubseq_ids.txt
#
# Note: requires Ruby 3.x. Older Ruby gives a syntax error
# (the rightward pattern match `data => {...}` in sparql() below is
# Ruby 3 syntax).

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'
require 'pp'

# Page size for the LIMIT/OFFSET pagination loop at the bottom of the
# script; also the "was this the last page?" sentinel.
MAX=5_000

# Prefix declarations prepended to every query sent to the endpoint.
SPARQL_HEADER="
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix dc: <http://purl.org/dc/terms/>
prefix schema: <https://schema.org/>
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
"
+
+# Build a SPARQL query, submit and return results. Apply transform
+# lambda when passed in
# Build a SPARQL query, submit and return results. Apply transform
# lambda when passed in.
#
# query     -- SPARQL body; the shared SPARQL_HEADER is prepended
# transform -- optional lambda applied to each row's Hash
#
# Returns an Array with one entry per result row: a Hash of variable
# name (Symbol) -> string value, or whatever `transform` returns.
#
# NOTE(review): no HTTP status or error handling -- a non-200 response
# or malformed body will surface as a JSON.parse/pattern-match failure.
def sparql query, transform = nil
  # Results are requested as application/sparql-results+json; the query
  # itself is URL-encoded into the GET request.
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  data = JSON.parse(response.body,symbolize_names: true)
  # Rightward pattern match (Ruby 3): destructures the standard SPARQL
  # JSON envelope, binding `vars` (variable names) and `results` (rows).
  data => { head: { vars: }, results: { bindings: results} }
  vars = vars.map { |v| v.to_sym }
  results.map { |rec|
    # return results after transforming to a Hash and applying the
    # optional transform lambda. Note the transform can not only
    # reduce results, or create an array, but also may transform into
    # an OpenStruct.
    res = {}
    vars.each { |name| res[name] = rec[name][:value] }
    if transform
      transform.call(res)
    else
      res
    end
  }
end
+
# Page through all ids in batches of MAX, printing one id per line on
# stdout (progress goes to stderr so the output stays pipeable).
start = 0
num = MAX
begin
  query = "
SELECT DISTINCT ?id
FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
WHERE {

  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .

} LIMIT #{num} OFFSET #{start}
"
  # Keep only the ?id column of each row.
  list = sparql(query, lambda { |rec| rec[:id] })
  list.each do | l |
    print(l,"\n")
  end
  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
  start += num
  # begin/end-while is a post-condition loop: the body runs at least
  # once, and a short (or empty) page signals the last batch.
end while list.size == MAX