about summary refs log tree commit diff
path: root/workflows/pubseq
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/pubseq')
-rwxr-xr-xworkflows/pubseq/normalize-yamlfa.py97
-rw-r--r--workflows/pubseq/normalize/README.md14
-rw-r--r--workflows/pubseq/normalize/__init__.py0
-rw-r--r--workflows/pubseq/normalize/mapping.py102
-rwxr-xr-xworkflows/pubseq/pubseq-fetch-data.py55
-rwxr-xr-xworkflows/pubseq/pubseq-fetch-ids67
6 files changed, 335 insertions, 0 deletions
diff --git a/workflows/pubseq/normalize-yamlfa.py b/workflows/pubseq/normalize-yamlfa.py
new file mode 100755
index 0000000..55a8848
--- /dev/null
+++ b/workflows/pubseq/normalize-yamlfa.py
@@ -0,0 +1,97 @@
# --- Normalize data
# normalize-yamlfa.py [--yaml] --state ~/tmp/pubseq/state.json [id(s)]
#
# Example:
#
#    python3 ./workflows/pubseq/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
+
+import argparse
+import json
+import os
+import sys
+import types
+import normalize.mapping as mapping
+
# Command-line interface -------------------------------------------------

parser = argparse.ArgumentParser(description="""

Normalize parameters in PubSeq JSON/YAML files. All entries in
directory are parsed using the state.json file. It is possible
to select a subset of IDs.

This tool has two modes of operation. It can validate with the
--validate switch which stops at a warning and does no rewriting.
This mode is typically used in troubleshooting.

The other mode is --rewrite which rewrites the JSON files after
making a backup (.bak) of the original. This mode updates files and
won't stop - it is used for (automated) uploads.

""")

parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
parser.add_argument('--species', type=str, help='Species mapping file')
parser.add_argument('--specimen', type=str, help='Optional specimen mapping file')
parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
parser.add_argument('id', nargs='*', help='optional id(s)')

args = parser.parse_args()

# The state file lists all records; the per-id JSON files live next to it.
with open(args.state) as jsonf:
    data = json.load(jsonf)

basedir = os.path.dirname(args.state)  # renamed from `dir` (shadowed builtin)
do_validate = args.validate
do_rewrite = args.rewrite

# Default to every id in the state file when none are given
ids = args.id
if not ids:
    ids = list(data.keys())


def load_mapping(csvfn, label):
    """Read a two-column name,uri CSV into a dict; warn on stderr when absent."""
    result = {}
    if csvfn:
        with open(csvfn) as f:
            for line in f:
                name, uri = line.strip().split(',')
                result[name] = uri
    else:
        print(f"WARNING: no {label} mapping", file=sys.stderr)
    return result


species = load_mapping(args.species, "species")
specimen = load_mapping(args.specimen, "specimen")

for id in ids:
    if args.yaml:
        raise Exception("YAML not yet supported")
    fn = f"{basedir}/{id}.json"
    print(f"Reading {fn}", file=sys.stderr)
    with open(fn) as f:
        rec = types.SimpleNamespace(**json.load(f))
        if do_validate:
            print(rec)
        # Guard: some records may lack a warnings list
        if getattr(rec, 'warnings', None) is None:
            rec.warnings = []
        # BUGFIX: track ALL warnings for the bailout check below; previously
        # only the last (specimen) warning could stop validation because the
        # `warning` variable was overwritten by the second mapping call.
        has_warning = False
        rec.host, warning = mapping.host_species(rec.host, species)
        if warning:
            print("WARNING "+warning, file=sys.stderr)
            rec.warnings.append(warning)
            has_warning = True
        rec.sample, warning = mapping.specimen_source(rec.sample, specimen)
        if warning:
            print("WARNING "+warning, file=sys.stderr)
            rec.warnings.append(warning)
            has_warning = True
        print(rec)
        if do_validate and has_warning:
            print("bailing out in validation mode", file=sys.stderr)
            sys.exit(2)
        if do_rewrite:
            if not os.path.exists(fn+".bak"):  # make backup the first time
                os.rename(fn, fn+".bak")
            with open(fn, 'w') as outfile:
                print(f"    Writing {fn}")
                json.dump(rec.__dict__, outfile, indent=2)
        else:
            print(rec)
diff --git a/workflows/pubseq/normalize/README.md b/workflows/pubseq/normalize/README.md
new file mode 100644
index 0000000..b780a68
--- /dev/null
+++ b/workflows/pubseq/normalize/README.md
@@ -0,0 +1,14 @@
+# Normalization steps
+
This library contains generic logic to normalize (string) data and
transform strings to URIs.  It should be applicable to data from
any source (GenBank, ENA, etc.).
+
+Important: missing data should be missing or None! Do not fill
+in data by 'guessing'.
+
+When data is malformed a warning should be logged and added to the
+warning list. Functions should be small enough to return only 1
+warning!
+
+Pjotr Prins (c) 2021
diff --git a/workflows/pubseq/normalize/__init__.py b/workflows/pubseq/normalize/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/workflows/pubseq/normalize/__init__.py
diff --git a/workflows/pubseq/normalize/mapping.py b/workflows/pubseq/normalize/mapping.py
new file mode 100644
index 0000000..3ed09c2
--- /dev/null
+++ b/workflows/pubseq/normalize/mapping.py
@@ -0,0 +1,102 @@
+# Normalization steps
+#
# This library contains generic logic to normalize (string) data and
# transform strings to URIs.  It should be applicable to data from
# any source (GenBank, ENA, etc.).
+#
+#   Important: missing data should be missing or None! Do not fill
+#   in data by 'guessing'.
+#
+#   When data is malformed a warning should be logged and added to the
+#   warning list. Functions should be small enough to return only 1
+#   warning!
+#
+#   Pjotr Prins (c) 2021
+
+import re
+import types
+
def host_species(host,mapping):
    """Normalize host['host_species'] to an NCBITaxon URI.

    host: dict with a 'host_species' field (free text or an existing URI)
    mapping: exact name -> URI overrides; takes precedence over the
             built-in regex term table

    Returns (host_dict, warning) where warning is None or one message
    string.  When no URI can be found, host_species is removed from the
    returned dict (README: missing data should be missing).
    """
    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
    Unknown = "Not found"  # sentinel URI value: drop silently, no warning

    SPECIES_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
        "human": Homo_sapiens,
        "sapiens": Homo_sapiens,
        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
        # Mink:
        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
        }

    warning = None
    host = types.SimpleNamespace(**host)
    try:
        if 'obolibrary' not in host.host_species:
            key = host.host_species
            host.host_species = None
            if key in mapping:  # explicit CSV mapping wins
                host.host_species = mapping[key]
            else:
                for term,uri in SPECIES_TERMS.items():
                    if re.match(".*?"+term, key, re.IGNORECASE):
                        host.host_species = uri
                        break  # BUGFIX: first match wins (dict order is the priority)
            if not host.host_species:
                warning = f"No URI mapping for host_species <{key}>"
            if host.host_species == Unknown or host.host_species is None:
                del host.host_species
    except AttributeError:
        # No host_species field at all - leave record as-is (matches
        # specimen_source's behavior for missing fields)
        pass
    return host.__dict__,warning
+
Unknown = "Not found" # Sentinel: recognized but unmappable, so as not to create a warning

def specimen_source(sample,mapping):
    """Normalize sample['specimen_source'] (a list) to ontology URIs.

    sample: dict with an optional 'specimen_source' list of free-text
            strings and/or existing URIs
    mapping: exact name -> URI overrides; takes precedence over the
             built-in regex term table

    Returns (sample_dict, warning) where warning is None or one message
    string.  Unmappable 'unknown'-style values are dropped silently;
    when nothing remains the field is removed entirely (README: missing
    data should be missing).
    """
    Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
    Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
    Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
    Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
    Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
    Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
    Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
    # BUGFIX: a trailing comma made this a 1-tuple, leaking a tuple into the
    # output JSON for every "broncho" specimen
    Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195"
    Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
    SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
        "Oronasopharynx": Oronasopharynx,
        "orophar": Oropharyngeal,
        "pharyngeal": Nasopharyngeal,
        r"\snares": Nasal_Swab,  # raw string: \s is a regex whitespace class
        "saliva": Saliva,
        "swab": Nasal_Swab,
        "broncho": Bronchoalveolar_Lavage,
        "seafood": Frozen_Food,
        "packaging": Frozen_Food,
        "specimen": Biospecimen,
        "patient": Biospecimen,
        "uknown": Unknown,
        "unknown": Unknown
        }
    warning = None
    sample = types.SimpleNamespace(**sample)
    try:
        if sample.specimen_source:
            keys = sample.specimen_source
            sample.specimen_source = []
            for key in keys:
                if 'obolibrary' in key:  # already a URI
                    sample.specimen_source.append(key)
                    continue
                if key in mapping:  # explicit CSV mapping wins
                    sample.specimen_source.append(mapping[key])
                    continue
                uri = None
                for term,term_uri in SPECIMEN_TERMS.items():
                    if re.match(".*?"+term, key, re.IGNORECASE):
                        uri = term_uri
                        break  # first match wins (dict order is the priority)
                if uri is None:
                    # BUGFIX: warn per unmapped key; the old len()==0 test was
                    # masked as soon as any earlier key had mapped
                    warning = f"No URI mapping for specimen_source <{key}>"
                elif uri != Unknown:
                    # BUGFIX: append instead of replacing the whole list, so
                    # earlier keys are kept; Unknown entries are dropped
                    sample.specimen_source.append(uri)
        if not sample.specimen_source:
            # None or empty after normalization: remove the field entirely
            del sample.specimen_source
    except AttributeError:
        # No specimen_source field at all - leave record as-is
        pass
    return sample.__dict__,warning
diff --git a/workflows/pubseq/pubseq-fetch-data.py b/workflows/pubseq/pubseq-fetch-data.py
new file mode 100755
index 0000000..ef4edde
--- /dev/null
+++ b/workflows/pubseq/pubseq-fetch-data.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import requests
+import sys
+import time
+
parser = argparse.ArgumentParser(description="""

Fetch metadata (JSON) from PubSeq and optionally the FASTA files.  IDs
can be passed in on the command line or in a file.

""")
parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
parser.add_argument('--out', type=str, help='Directory to write to',
required=True)
parser.add_argument('--ids', type=str, help='File with ids', required=False)
parser.add_argument('id', nargs='*', help='id(s)')
args = parser.parse_args()

basedir = args.out  # renamed from `dir` (shadowed builtin)
if not os.path.exists(basedir):
    raise Exception(f"Directory {basedir} does not exist")

# IDs come from the command line, or one-per-line from --ids
ids = args.id
if not ids:
    print(f"Reading {args.ids}")
    with open(args.ids) as f:
        ids = [l.strip() for l in f]

for id in ids:
    print(id)
    jsonfn = f"{basedir}/{id}.json"
    if not os.path.exists(jsonfn):  # skip already-fetched records
        api_url = f"http://covid19.genenetwork.org/api/sample/{id}.json"
        r = requests.get(api_url)
        count = 0
        # requests.Response is falsy on HTTP error status: retry up to
        # 10 times with a 15s pause (the API can lag behind uploads)
        while not r:
            count += 1
            if count > 10:
                raise Exception(f"Can not find record for {id}")
            time.sleep(15)
            r = requests.get(api_url)
        rec = r.json()[0]  # parse once; reused for metadata and fasta URLs
        mr = requests.get(rec['metadata'])
        with open(jsonfn, "w") as outf:
            outf.write(mr.text)
        if args.fasta:
            fastafn = f"{basedir}/{id}.fa"
            if os.path.exists(fastafn): continue
            fr = requests.get(rec['fasta'])
            with open(fastafn, "w") as outf:
                outf.write(fr.text)
+
diff --git a/workflows/pubseq/pubseq-fetch-ids b/workflows/pubseq/pubseq-fetch-ids
new file mode 100755
index 0000000..f5920ec
--- /dev/null
+++ b/workflows/pubseq/pubseq-fetch-ids
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+#   pubseq-fetch-ids > pubseq_ids.txt
+#
+# Note: requires Ruby 3.x. Older Ruby gives a syntax error
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+require 'pp'
+
# Page size for the LIMIT/OFFSET paging loop below
MAX=5_000

# Prefixes prepended to every query submitted through sparql()
SPARQL_HEADER="
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix dc: <http://purl.org/dc/terms/>
prefix schema: <https://schema.org/>
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
"

# Build a SPARQL query, submit and return results. Apply transform
# lambda when passed in
#
# query:     SPARQL body (SPARQL_HEADER is prepended before URL-encoding)
# transform: optional lambda called with each result Hash
#
# Returns an Array with one entry per binding row: a Hash keyed by the
# query's variable names (string values), or whatever transform returns.
def sparql query, transform = nil
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  data = JSON.parse(response.body,symbolize_names: true)
  # Rightward pattern match (Ruby 3.x) destructures the standard SPARQL
  # JSON results envelope into vars (variable names) and results (rows)
  data => { head: { vars: }, results: { bindings: results} }
  vars = vars.map { |v| v.to_sym }
  results.map { |rec|
    # return results after transforming to a Hash and applying the
    # optional transform lambda. Note the transform can not only
    # reduce results, or create an array, but also may transform into
    # an OpenStruct.
    res = {}
    vars.each { |name| res[name] = rec[name][:value] }
    if transform
      transform.call(res)
    else
      res
    end
  }
end
+
# Page through all PubSeq IDs MAX at a time, printing one id per line
# on stdout and a progress line on stderr.
start = 0
num = MAX
loop do
  query = "
SELECT DISTINCT ?id
FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
WHERE {

  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .

} LIMIT #{num} OFFSET #{start}
"
  list = sparql(query, lambda { |rec| rec[:id] })
  list.each { |id| print(id,"\n") }
  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
  start += num
  # A short page means we have reached the end of the result set
  break unless list.size == MAX
end