From 8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 28 Jan 2021 18:45:52 +0000
Subject: Moving tools out of submodules (sorry!)

---
 workflows/tools/normalize-yamlfa.py   |  97 --------------------------------
 workflows/tools/normalize/README.md   |  14 -----
 workflows/tools/normalize/__init__.py |   0
 workflows/tools/normalize/mapping.py  | 102 ----------------------------------
 workflows/tools/pubseq-fetch-data.py  |  55 ------------------
 workflows/tools/pubseq-fetch-ids      |  67 ----------------------
 6 files changed, 335 deletions(-)
 delete mode 100755 workflows/tools/normalize-yamlfa.py
 delete mode 100644 workflows/tools/normalize/README.md
 delete mode 100644 workflows/tools/normalize/__init__.py
 delete mode 100644 workflows/tools/normalize/mapping.py
 delete mode 100755 workflows/tools/pubseq-fetch-data.py
 delete mode 100755 workflows/tools/pubseq-fetch-ids

diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
deleted file mode 100755
index 55a8848..0000000
--- a/workflows/tools/normalize-yamlfa.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# --- Normalize data
-# normalize-yamlfa.py [--yaml] --state ~/tmp/pubseq/state.json [id ...]
-#
-# Example:
-#
-#    python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
-
-import argparse
-import json
-import os
-import sys
-import types
-import normalize.mapping as mapping
-
-parser = argparse.ArgumentParser(description="""
-
-Normalize parameters in PubSeq JSON/YAML files. All entries in the
-directory are parsed using the state.json file. It is possible to
-select a subset of IDs.
-
-This tool has two modes of operation. With the --validate switch it
-stops at the first warning and does no rewriting. This mode is
-typically used for troubleshooting.
-
-The other mode, --rewrite, rewrites the JSON files after making a
-backup (.bak) of the original. This mode updates files and does not
-stop on warnings; it is used for (automated) uploads.
-
-""")
-
-parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
-parser.add_argument('--species', type=str, help='Species mapping file')
-parser.add_argument('--specimen', type=str, help='Optional specimen mapping file')
-parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
-parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
-parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
-parser.add_argument('id', nargs='*', help='optional id(s)')
-
-args = parser.parse_args()
-
-with open(args.state) as jsonf:
-    data = json.load(jsonf)
-
-dir = os.path.dirname(args.state)
-do_validate = args.validate
-do_rewrite = args.rewrite
-
-ids = args.id
-if not ids:
-    ids = list(data.keys())
-
-species = {}
-if args.species:
-    with open(args.species) as f:
-        for line in f:
-            name,uri = line.strip().split(',')
-            species[name] = uri
-else:
-    print("WARNING: no species mapping",file=sys.stderr)
-specimen = {}
-if args.specimen:
-    with open(args.specimen) as f:
-        for line in f:
-            name,uri = line.strip().split(',')
-            specimen[name] = uri
-else:
-    print("WARNING: no specimen mapping",file=sys.stderr)
-
-for id in ids:
-    if args.yaml:
-        raise Exception("YAML not yet supported")
-    fn = f"{dir}/{id}.json"
-    print(f"Reading {fn}",file=sys.stderr)
-    with open(fn) as f:
-        rec = types.SimpleNamespace(**json.load(f))
-        if do_validate:
-            print(rec)
-        warnings = []
-        rec.host,warning = mapping.host_species(rec.host,species)
-        if warning: warnings.append(warning)
-        rec.sample,warning = mapping.specimen_source(rec.sample,specimen)
-        if warning: warnings.append(warning)
-        for warning in warnings:
-            print("WARNING "+warning,file=sys.stderr)
-            rec.warnings.append(warning)
-        print(rec)
-        if do_validate and warnings: # bail on any warning, not just the last
-            print("bailing out in validation mode",file=sys.stderr)
-            sys.exit(2)
-        if do_rewrite:
-            if not os.path.exists(fn+".bak"): # make backup the first time
-                os.rename(fn,fn+".bak")
-            with open(fn, 'w') as outfile:
-                print(f"    Writing {fn}")
-                json.dump(rec.__dict__, outfile, indent=2)
-        else:
-            print(rec)
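
The --species and --specimen files read above are headerless two-column CSV
files mapping a free-text name to a URI, split on a single comma per line. A
minimal sketch of producing one in Python (the script name is hypothetical;
the two URIs are taken from mapping.py below):

    # make_species_csv.py: illustrative helper, not part of the tool.
    # Writes the "name,uri" lines that normalize-yamlfa.py splits on a comma.
    rows = {
        "Homo sapiens": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
    }
    with open("ncbi_host_species.csv", "w") as f:
        for name, uri in rows.items():
            f.write(f"{name},{uri}\n")
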
diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md
deleted file mode 100644
index b780a68..0000000
--- a/workflows/tools/normalize/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Normalization steps
-
-This library contains generic logic to normalize (string) data and
-transform strings to URIs. It should be applicable to data from
-any source (GenBank, ENA etc.).
-
-Important: missing data should be missing or None! Do not fill
-in data by 'guessing'.
-
-When data is malformed, a warning should be logged and added to the
-warning list. Functions should be small enough to return at most
-one warning!
-
-Pjotr Prins (c) 2021
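
The convention the README describes is that every normalizer hands back the
(possibly rewritten) record plus at most one warning. A minimal sketch of
that shape, using a hypothetical field name for illustration:

    def normalize_example(rec, mapping):
        """Hypothetical normalizer following the convention above:
        map a free-text value to a URI, return (record, warning)."""
        warning = None
        value = rec.get("example_field")   # hypothetical field name
        if value is None:
            return rec, None               # missing data stays missing
        if value in mapping:
            rec["example_field"] = mapping[value]
        else:
            warning = f"No URI mapping for example_field <{value}>"
        return rec, warning
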
diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
deleted file mode 100644
index 3ed09c2..0000000
--- a/workflows/tools/normalize/mapping.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Normalization steps
-#
-# This library contains generic logic to normalize (string) data and
-# transform strings to URIs. It should be applicable to data from
-# any source (GenBank, ENA etc.).
-#
-#   Important: missing data should be missing or None! Do not fill
-#   in data by 'guessing'.
-#
-#   When data is malformed, a warning should be logged and added to the
-#   warning list. Functions should be small enough to return at most
-#   one warning!
-#
-#   Pjotr Prins (c) 2021
-
-import re
-import types
-
-def host_species(host,mapping):
-    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
-
-    SPECIES_TERMS = { # insertion-ordered since Python 3.7; keys are regex fragments
-        "human": Homo_sapiens,
-        "sapiens": Homo_sapiens,
-        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
-        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
-        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
-        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
-        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
-        # Mink:
-        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
-        }
-
-    warning = None
-    host = types.SimpleNamespace(**host)
-    if 'obolibrary' not in host.host_species:
-        key = host.host_species
-        host.host_species = None
-        if key in mapping:
-            host.host_species = mapping[key]
-        else:
-            for term in SPECIES_TERMS:
-                p = re.compile(".*?"+term,re.IGNORECASE)
-                m = p.match(key)
-                if m: host.host_species = SPECIES_TERMS[term]; break # first match wins
-        if not host.host_species:
-            warning = f"No URI mapping for host_species <{key}>"
-        if host.host_species == Unknown or host.host_species is None:
-            del host.host_species
-    return host.__dict__,warning
-
-Unknown = "Not found" # sentinel so no warning is raised; the field is deleted instead
-
-def specimen_source(sample,mapping):
-    Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
-    Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
-    Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
-    Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
-    Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
-    Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
-    Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
-    Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195"
-    Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
-    SPECIMEN_TERMS = { # insertion-ordered since Python 3.7; keys are regex fragments
-        "Oronasopharynx": Oronasopharynx,
-        "orophar": Oropharyngeal,
-        "pharyngeal": Nasopharyngeal,
-        "\snares": Nasal_Swab,
-        "saliva": Saliva,
-        "swab": Nasal_Swab,
-        "broncho": Bronchoalveolar_Lavage,
-        "seafood": Frozen_Food,
-        "packaging": Frozen_Food,
-        "specimen": Biospecimen,
-        "patient": Biospecimen,
-        "uknown": Unknown,
-        "unknown": Unknown
-        }
-    warning = None
-    sample = types.SimpleNamespace(**sample)
-    try:
-        if sample.specimen_source:
-            keys = sample.specimen_source
-            sample.specimen_source = []
-            for key in keys:
-                if 'obolibrary' in key:
-                    sample.specimen_source.append(key)
-                    continue
-                if key in mapping:
-                    sample.specimen_source.append(mapping[key])
-                else:
-                    for term in SPECIMEN_TERMS:
-                        p = re.compile(".*?"+term,re.IGNORECASE)
-                        m = p.match(key)
-                        if m: sample.specimen_source.append(SPECIMEN_TERMS[term]); break # first match wins
-                if len(sample.specimen_source)==0:
-                    warning = f"No URI mapping for specimen_source <{key}>"
-        if sample.specimen_source is None or Unknown in sample.specimen_source:
-            del sample.specimen_source
-    except AttributeError:
-        pass
-    return sample.__dict__,warning
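
For illustration, host_species can be exercised directly with a record
fragment shaped like the dicts normalize-yamlfa.py passes in (the input
value is made up):

    import normalize.mapping as mapping

    host = {"host_species": "Homo sapiens; female, age 65"}
    host, warning = mapping.host_species(host, {})
    print(host)     # {'host_species': 'http://purl.obolibrary.org/obo/NCBITaxon_9606'}
    print(warning)  # None: the regex fragment "sapiens" matched
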
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
deleted file mode 100755
index ef4edde..0000000
--- a/workflows/tools/pubseq-fetch-data.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import json
-import os
-import requests
-import sys
-import time
-
-parser = argparse.ArgumentParser(description="""
-
-Fetch metadata (JSON) from PubSeq and optionally the FASTA files.  IDs
-can be passed in on the command line or in a file.
-
-""")
-parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
-parser.add_argument('--out', type=str, help='Directory to write to',
-required=True)
-parser.add_argument('--ids', type=str, help='File with ids', required=False)
-parser.add_argument('id', nargs='*', help='id(s)')
-args = parser.parse_args()
-
-dir = args.out
-if not os.path.exists(dir):
-    raise Exception(f"Directory {dir} does not exist")
-
-ids = args.id
-if len(ids) == 0:
-    print(f"Reading {args.ids}")
-    with open(args.ids) as f:
-        ids = [l.strip() for l in f]
-
-for id in ids:
-    print(id)
-    jsonfn = dir+"/"+id+".json"
-    if not os.path.exists(jsonfn):
-        count = 0
-        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        while not r:
-            count += 1
-            if count > 10: raise Exception(f"Cannot find record for {id}")
-            time.sleep(15)
-            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        m_url = r.json()[0]['metadata']
-        mr = requests.get(m_url)
-        with open(dir+"/"+id+".json","w") as outf:
-            outf.write(mr.text)
-        if args.fasta:
-            fastafn = dir+"/"+id+".fa"
-            if os.path.exists(fastafn): continue
-            fa_url = r.json()[0]['fasta']
-            fr = requests.get(fa_url)
-            with open(fastafn,"w") as outf:
-                outf.write(fr.text)
-
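
The retry loop above leans on requests.Response truthiness, which is falsy
for HTTP status codes >= 400. The same pattern as a standalone sketch (the
function name and limits are illustrative):

    import time
    import requests

    def fetch_with_retry(url, tries=10, delay=15):
        """Retry a GET; requests.Response is falsy for status >= 400."""
        count = 0
        r = requests.get(url)
        while not r:
            count += 1
            if count > tries:
                raise Exception(f"Cannot fetch {url}")
            time.sleep(delay)
            r = requests.get(url)
        return r
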
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
deleted file mode 100755
index f5920ec..0000000
--- a/workflows/tools/pubseq-fetch-ids
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Use a SPARQL query to fetch all IDs in the PubSeq database
-#
-#   pubseq-fetch-ids > pubseq_ids.txt
-#
-# Note: requires Ruby 3.x (the data => {...} pattern match below); older Ruby gives a syntax error
-
-require 'net/http'
-require 'json'
-require 'ostruct'
-require 'erb'
-require 'pp'
-
-MAX=5_000
-
-SPARQL_HEADER="
-prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-prefix dc: <http://purl.org/dc/terms/>
-prefix schema: <https://schema.org/>
-prefix pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-"
-
-# Submit a SPARQL query and return the results, applying the transform
-# lambda when one is passed in.
-def sparql query, transform = nil
-  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
-
-  response = Net::HTTP.get_response(URI.parse(api_url))
-  data = JSON.parse(response.body,symbolize_names: true)
-  data => { head: { vars: }, results: { bindings: results} }
-  vars = vars.map { |v| v.to_sym }
-  results.map { |rec|
-    # Return results after converting each binding to a Hash and
-    # applying the optional transform lambda. The transform may
-    # reduce a result, build up an array, or turn the record into
-    # an OpenStruct.
-    res = {}
-    vars.each { |name| res[name] = rec[name][:value] }
-    if transform
-      transform.call(res)
-    else
-      res
-    end
-  }
-end
-
-start = 0
-num = MAX
-begin
-  query = "
-SELECT DISTINCT ?id
-FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
-WHERE {
-
-  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
-
-} LIMIT #{num} OFFSET #{start}
-"
-  list = sparql(query, lambda { |rec| rec[:id] })
-  list.each do | l |
-    print(l,"\n")
-  end
-  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
-  start += num
-end while list.size == MAX
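
The begin/end-while loop pages through the endpoint MAX rows at a time and
stops on the first short page. The same pagination pattern sketched in
Python, against a hypothetical run_query(limit, offset) helper standing in
for the SPARQL call:

    MAX = 5_000

    def fetch_all_ids(run_query):
        """Yield ids page by page; run_query(limit, offset) -> list of ids
        (hypothetical helper standing in for the SPARQL request above)."""
        start = 0
        while True:
            page = run_query(MAX, start)
            yield from page
            if len(page) < MAX:   # short page: no more results
                break
            start += MAX

In practice the two fetch tools chain: pubseq-fetch-ids writes one id per
line, which pubseq-fetch-data.py consumes via its --ids option.
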
-- 