From 3c09a92423408d01b64e1b842c6b96778939d098 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 07:11:13 +0000
Subject: Rename

---
 workflows/tools/pubseq-fetch-ids | 67 ++++++++++++++++++++++++++++++++++++++++
 workflows/tools/sparql-fetch-ids | 67 ----------------------------------------
 2 files changed, 67 insertions(+), 67 deletions(-)
 create mode 100755 workflows/tools/pubseq-fetch-ids
 delete mode 100755 workflows/tools/sparql-fetch-ids

diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
new file mode 100755
index 0000000..19b2d82
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-ids
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+#   sparql-fetch-ids > pubseq_ids.txt
+#
+# Note: requires Ruby 3.x. Older Ruby gives a syntax error
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+require 'pp'
+
+MAX=5_000
+
+SPARQL_HEADER="
+prefix rdfs:
+prefix rdf:
+prefix dc:
+prefix schema:
+PREFIX pubseq:
+"
+
+# Build a SPARQL query, submit and return results. Apply transform
+# lambda when passed in
+def sparql query, transform = nil
+  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
+
+  response = Net::HTTP.get_response(URI.parse(api_url))
+  data = JSON.parse(response.body,symbolize_names: true)
+  data => { head: { vars: }, results: { bindings: results} }
+  vars = vars.map { |v| v.to_sym }
+  results.map { |rec|
+    # return results after transforming to a Hash and applying the
+    # optional transform lambda. Note the transform can not only
+    # reduce results, or create an array, but also may transform into
+    # an OpenStruct.
+    res = {}
+    vars.each { |name| res[name] = rec[name][:value] }
+    if transform
+      transform.call(res)
+    else
+      res
+    end
+  }
+end
+
+start = 0
+num = MAX
+begin
+  query = "
+SELECT DISTINCT ?id
+FROM
+WHERE {
+
+  ?arvid ?id .
+
+} LIMIT #{num} OFFSET #{start}
+"
+  list = sparql(query, lambda { |rec| rec[:id] })
+  list.each do | l |
+    print(l,"\n")
+  end
+  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
+  start += num
+end while list.size == MAX
diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids
deleted file mode 100755
index 19b2d82..0000000
--- a/workflows/tools/sparql-fetch-ids
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Use a SPARQL query to fetch all IDs in the PubSeq database
-#
-#   sparql-fetch-ids > pubseq_ids.txt
-#
-# Note: requires Ruby 3.x. Older Ruby gives a syntax error
-
-require 'net/http'
-require 'json'
-require 'ostruct'
-require 'erb'
-require 'pp'
-
-MAX=5_000
-
-SPARQL_HEADER="
-prefix rdfs:
-prefix rdf:
-prefix dc:
-prefix schema:
-PREFIX pubseq:
-"
-
-# Build a SPARQL query, submit and return results. Apply transform
-# lambda when passed in
-def sparql query, transform = nil
-  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
-
-  response = Net::HTTP.get_response(URI.parse(api_url))
-  data = JSON.parse(response.body,symbolize_names: true)
-  data => { head: { vars: }, results: { bindings: results} }
-  vars = vars.map { |v| v.to_sym }
-  results.map { |rec|
-    # return results after transforming to a Hash and applying the
-    # optional transform lambda. Note the transform can not only
-    # reduce results, or create an array, but also may transform into
-    # an OpenStruct.
-    res = {}
-    vars.each { |name| res[name] = rec[name][:value] }
-    if transform
-      transform.call(res)
-    else
-      res
-    end
-  }
-end
-
-start = 0
-num = MAX
-begin
-  query = "
-SELECT DISTINCT ?id
-FROM
-WHERE {
-
-  ?arvid ?id .
-
-} LIMIT #{num} OFFSET #{start}
-"
-  list = sparql(query, lambda { |rec| rec[:id] })
-  list.each do | l |
-    print(l,"\n")
-  end
-  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
-  start += num
-end while list.size == MAX
-- cgit v1.2.3
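The Ruby script above pages through the SPARQL endpoint in MAX-sized batches, using LIMIT/OFFSET and stopping once a batch comes back short. The same pattern in Python, as a rough sketch only: the endpoint is the one hard-coded in the script, but GRAPH_URI and PREDICATE_URI are placeholders, because the angle-bracketed URIs in the query text above did not survive rendering.

```python
import requests

ENDPOINT = "http://sparql.genenetwork.org/sparql/"  # endpoint used by the script above
MAX = 5_000

def sparql_ids(offset, limit=MAX):
    # GRAPH_URI and PREDICATE_URI are placeholders, not the real URIs
    query = f"""
SELECT DISTINCT ?id
FROM <GRAPH_URI>
WHERE {{ ?arvid <PREDICATE_URI> ?id . }}
LIMIT {limit} OFFSET {offset}
"""
    r = requests.get(ENDPOINT, params={
        "format": "application/sparql-results+json",
        "query": query})
    r.raise_for_status()
    return [b["id"]["value"] for b in r.json()["results"]["bindings"]]

offset = 0
while True:
    batch = sparql_ids(offset)
    for id in batch:
        print(id)
    if len(batch) < MAX:  # a short batch means the last page was reached
        break
    offset += MAX
```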
From 9d75ce088e6388bf23ae077fd06b2a3f51be1bda Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 09:34:26 +0000
Subject: API: fix returned record to include original metadata

---
 bh20simplewebuploader/api.py          | 34 ++++++++++++++++++++++++++++++++--
 test/rest-api.org                     | 29 +++++++++++++++++++++++++++++
 workflows/pull-data/genbank/README.md | 12 ++++++++++--
 workflows/tools/pubseq-fetch-ids      |  2 +-
 4 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py
index b1b505f..11c74f2 100644
--- a/bh20simplewebuploader/api.py
+++ b/bh20simplewebuploader/api.py
@@ -7,6 +7,9 @@ import sys
 from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
 from bh20simplewebuploader.main import app, sparqlURL
 
+PUBSEQ="http://covid19.genenetwork.org"
+ARVADOS="https://collections.lugli.arvadosapi.com/c="
+
 # Helper functions
 
 def fetch_sample_metadata(id):
@@ -42,13 +45,40 @@ def version():
 
 @app.route('/api/sample/<id>.json')
 def sample(id):
+    """
+
+API sample should return a record pointing to other resources,
+notably: permalink, original metadata record and the fasta
+data.
+
+curl http://localhost:5067/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "date": "2020-04-27",
+    "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml",
+    "permalink": "http://covid19.genenetwork.org/resource/MT533203.1",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+
+
+"""
     # metadata = file.name(seq)+"/metadata.yaml"
     meta = fetch_sample_metadata(id)
     print(meta)
+    # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml
     return jsonify([{
         'id': x['id']['value'],
-        'fasta': x['seq']['value'],
-        'collection': os.path.dirname(x['seq']['value']),
+        'collection': x['seq']['value'],
+        'permalink': PUBSEQ+'/resource/'+x['id']['value'],
+        'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta',
+        'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml',
         'date': x['date']['value'],
         'info': x['info']['value'],
         'specimen': x['specimen']['value'],
diff --git a/test/rest-api.org b/test/rest-api.org
index 66639c3..2ea2b11 100644
--- a/test/rest-api.org
+++ b/test/rest-api.org
@@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version
 }
 #+end_src
 
+The current API can fetch data
+
+#+begin_src js
+curl http://covid19.genenetwork.org/api/search?s=MT533203.1
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence"
+  }
+]
+
+curl http://covid19.genenetwork.org/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "date": "2020-04-27",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+#+end_src
+
+
 The Python3 version is
 
 #+begin_src python :session :exports both
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 5464d1d..188ff6f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq
 
 ```sh
 # --- get list of IDs already in PubSeq
-../../tools/sparql-fetch-ids > pubseq_ids.txt
+../../tools/pubseq-fetch-ids > pubseq_ids.txt
+
 # --- get list of missing genbank IDs
 python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
@@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s
 ```
 
+## Validate GenBank data
+
+To pull the data from PubSeq use the list of pubseq ids generated
+above.
+
+
+
 # TODO
 
-- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
+- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
index 19b2d82..f5920ec 100755
--- a/workflows/tools/pubseq-fetch-ids
+++ b/workflows/tools/pubseq-fetch-ids
@@ -2,7 +2,7 @@
 #
 # Use a SPARQL query to fetch all IDs in the PubSeq database
 #
-#   sparql-fetch-ids > pubseq_ids.txt
+#   pubseq-fetch-ids > pubseq_ids.txt
 #
 # Note: requires Ruby 3.x. Older Ruby gives a syntax error
-- cgit v1.2.3
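After this change a single API call yields a record that links to the permalink, the original metadata.yaml and the FASTA sequence. A minimal consumer, sketched against the fields documented in the docstring above:

```python
import requests

def fetch_sample(id):
    # Field names follow the api.py docstring above: permalink, metadata, fasta
    rec = requests.get(
        f"http://covid19.genenetwork.org/api/sample/{id}.json").json()[0]
    metadata = requests.get(rec['metadata']).text  # original metadata.yaml
    fasta = requests.get(rec['fasta']).text        # sequence.fasta
    return rec['permalink'], metadata, fasta

permalink, metadata, fasta = fetch_sample("MT533203.1")
print(permalink)
```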
2.17", + "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", + "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] + + +""" # metadata = file.name(seq)+"/metadata.yaml" meta = fetch_sample_metadata(id) print(meta) + # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml return jsonify([{ 'id': x['id']['value'], - 'fasta': x['seq']['value'], - 'collection': os.path.dirname(x['seq']['value']), + 'collection': x['seq']['value'], + 'permalink': PUBSEQ+'/resource/'+x['id']['value'], + 'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta', + 'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml', 'date': x['date']['value'], 'info': x['info']['value'], 'specimen': x['specimen']['value'], diff --git a/test/rest-api.org b/test/rest-api.org index 66639c3..2ea2b11 100644 --- a/test/rest-api.org +++ b/test/rest-api.org @@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version } #+end_src +The current API can fetch data + +#+begin_src js +curl http://covid19.genenetwork.org/api/search?s=MT533203.1 +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence" + } +] + +curl http://covid19.genenetwork.org/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "date": "2020-04-27", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 2.17", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] +#+end_src + + The Python3 version is #+begin_src python :session :exports both diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index 5464d1d..188ff6f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -../../tools/sparql-fetch-ids > pubseq_ids.txt +../../tools/pubseq-fetch-ids > pubseq_ids.txt + # --- get list of missing genbank IDs python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt @@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s ``` +## Validate GenBank data + +To pull the data from PubSeq use the list of pubseq ids generated +above. + + + # TODO -- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank +- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids index 19b2d82..f5920ec 100755 --- a/workflows/tools/pubseq-fetch-ids +++ b/workflows/tools/pubseq-fetch-ids @@ -2,7 +2,7 @@ # # Use a SPARQL query to fetch all IDs in the PubSeq database # -# sparql-fetch-ids > pubseq_ids.txt +# pubseq-fetch-ids > pubseq_ids.txt # # Note: requires Ruby 3.x. 
From 2ceeccd5e5158362548b868390e9d411f73cd9ff Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:29:41 +0000
Subject: fetch: do a straight dump of the original record

---
 workflows/tools/pubseq-fetch-data.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index c22d754..3f5e6cf 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -28,14 +28,13 @@ if (len(ids)==0):
     with open(args.ids) as f:
         ids = [ l.strip() for l in f.readlines() ]
 
-for id in ids[0:2]:
+for id in ids:
     print(id)
     r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
     if r:
         m_url = r.json()[0]['metadata']
         mr = requests.get(m_url)
-        meta = mr.json()
         with open(dir+"/"+id+".json","w") as outf:
-            json.dump(meta, outf, indent=4)
+            outf.write(mr.text)
     else:
         raise Exception(f"Can not find record for {id}")
-- cgit v1.2.3

From ced9613aa1c18c6a68056d1898b69865beac9ac2 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:35:05 +0000
Subject: Add option for fetching fasta

---
 workflows/tools/pubseq-fetch-data.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 3f5e6cf..23c4dea 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -12,6 +12,7 @@ Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
 can be passed in on the command line or in a file.
 
 """)
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
 parser.add_argument('--out', type=str, help='Directory to write to',
                     required=True)
 parser.add_argument('--ids', type=str, help='File with ids', required=False)
@@ -36,5 +37,10 @@ for id in ids:
         mr = requests.get(m_url)
         with open(dir+"/"+id+".json","w") as outf:
             outf.write(mr.text)
+        if args.fasta:
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(dir+"/"+id+".fa","w") as outf:
+                outf.write(fr.text)
     else:
         raise Exception(f"Can not find record for {id}")
-- cgit v1.2.3
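With these two patches the fetcher writes the original record verbatim and can optionally pull the FASTA alongside it. A usage sketch, using only the flags defined in the argparse block above (the output directory must exist beforehand):

```python
import subprocess

# Fetch the metadata JSON plus the FASTA for one accession into ./tmp
subprocess.run(["python3", "workflows/tools/pubseq-fetch-data.py",
                "--fasta", "--out", "tmp", "MT533203.1"], check=True)
```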
""") +parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records') parser.add_argument('--out', type=str, help='Directory to write to', required=True) parser.add_argument('--ids', type=str, help='File with ids', required=False) @@ -36,5 +37,10 @@ for id in ids: mr = requests.get(m_url) with open(dir+"/"+id+".json","w") as outf: outf.write(mr.text) + if args.fasta: + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(dir+"/"+id+".fa","w") as outf: + outf.write(fr.text) else: raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From 17cd8caa85991784f205109f2b64b255726a0e80 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 07:13:15 -0600 Subject: Fetching fixes --- workflows/tools/pubseq-fetch-data.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'workflows') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 23c4dea..2119fdf 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -31,16 +31,20 @@ if (len(ids)==0): for id in ids: print(id) - r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") - if r: - m_url = r.json()[0]['metadata'] - mr = requests.get(m_url) - with open(dir+"/"+id+".json","w") as outf: - outf.write(mr.text) - if args.fasta: - fa_url = r.json()[0]['fasta'] - fr = requests.get(fa_url) - with open(dir+"/"+id+".fa","w") as outf: - outf.write(fr.text) - else: - raise Exception(f"Can not find record for {id}") + jsonfn = dir+"/"+id+".json" + if not os.path.exists(jsonfn): + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + if r: + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + else: + raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From f7666a7766c8138aa690340fc68cb67f709327f3 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 5 Jan 2021 17:35:46 +0100 Subject: cleaning genbank-fetch-ids.py --- workflows/pull-data/genbank/genbank-fetch-ids.py | 40 +++++++++++------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'workflows') diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py index 1962daa..cb48cd8 100755 --- a/workflows/pull-data/genbank/genbank-fetch-ids.py +++ b/workflows/pull-data/genbank/genbank-fetch-ids.py @@ -6,28 +6,20 @@ # # See also directory .guix-run and README.md -BATCH_SIZE=5000 - import argparse -import json -import os -import requests import sys -import xml.etree.ElementTree as ET -from datetime import date, datetime -from dateutil.parser import parse +from datetime import date + +from Bio import Entrez parser = argparse.ArgumentParser() parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False) args = parser.parse_args() -from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME - -# min_acceptable_collection_date = datetime(2019, 12, 1) +BATCH_SIZE = 5000 -today_date = date.today().strftime("%Y.%m.%d") +Entrez.email = 'another_email@gmail.com' # FIXME skip = set() if args.skip: @@ -36,10 +28,11 @@ if 
From 911ba372cfc4b35c5b52d18a573a636ea78d16d7 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Tue, 5 Jan 2021 17:56:19 +0100
Subject: cleaning update-from-genbank.py; removed unused import from genbank-fetch-ids.py

---
 workflows/pull-data/genbank/genbank-fetch-ids.py   |  1 -
 workflows/pull-data/genbank/update-from-genbank.py | 25 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index cb48cd8..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -8,7 +8,6 @@
 
 import argparse
 import sys
-from datetime import date
 
 from Bio import Entrez
 
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
 from utils import chunks
 
 from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
 
-BATCH=100
+Entrez.email = 'another_email@gmail.com'  # FIXME
+
+BATCH = 100
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
 parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
 parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
 args = parser.parse_args()
 
 ids = set()
 with open(args.ids) as f:
-    content = f.readlines()
-    for line in content:
-        ids.add(line.strip())
+    ids.update([line.strip() for line in f])
 
 dir = args.out
 if not os.path.exists(dir):
     raise Exception(f"Directory {dir} does not exist")
 
 request_num = BATCH
 if args.max:
-    request_num = min(BATCH,args.max)
+    request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+    xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
 
-for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
-    print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
-        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-        if args.max and i*request_num >= args.max:
+        f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+    if args.max and num_chunk * request_num >= args.max:
         break
-- cgit v1.2.3
From c31835f787f3ae36e26bad0a1803f8557f8084e7 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 6 Jan 2021 02:33:35 -0600
Subject: Pubseq fetch: sometimes a request times out. So repeat with intervals.

---
 workflows/tools/pubseq-fetch-data.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 2119fdf..ef4edde 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -5,6 +5,7 @@ import json
 import os
 import requests
 import sys
+import time
 
 parser = argparse.ArgumentParser(description="""
 
@@ -33,18 +34,22 @@ for id in ids:
     print(id)
     jsonfn = dir+"/"+id+".json"
     if not os.path.exists(jsonfn):
+        count = 0
         r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        if r:
-            m_url = r.json()[0]['metadata']
-            mr = requests.get(m_url)
-            with open(dir+"/"+id+".json","w") as outf:
-                outf.write(mr.text)
-            if args.fasta:
-                fastafn = dir+"/"+id+".fa"
-                if os.path.exists(fastafn): continue
-                fa_url = r.json()[0]['fasta']
-                fr = requests.get(fa_url)
-                with open(fastafn,"w") as outf:
-                    outf.write(fr.text)
-        else:
-            raise Exception(f"Can not find record for {id}")
+        while not r:
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
-- cgit v1.2.3
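The retry logic above is inlined in the main loop; factored out it is a small helper. A sketch with the same parameters as the patch (up to 10 attempts, 15 seconds apart); `get_with_retry` is a name introduced here, not something in the repository:

```python
import time
import requests

def get_with_retry(url, attempts=10, pause=15):
    # Like the patch above: a falsy response (HTTP error) triggers a retry
    for n in range(attempts):
        r = requests.get(url)
        if r:
            return r
        if n < attempts - 1:
            time.sleep(pause)
    raise Exception(f"Can not fetch {url}")
```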
From 27a2b926036211469eccbf8c3d9580182482bdc2 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Wed, 6 Jan 2021 14:13:13 +0100
Subject: cleaned utils.py

---
 workflows/pull-data/genbank/utils.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
index 3efc67a..96920a5 100644
--- a/workflows/pull-data/genbank/utils.py
+++ b/workflows/pull-data/genbank/utils.py
@@ -1,5 +1,6 @@
 import os
 
+
 def is_integer(string_to_check):
     try:
         int(string_to_check)
@@ -7,19 +8,26 @@ def is_integer(string_to_check):
     except ValueError:
         return False
 
+
 def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
+
 def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
-    # Check duplicated entry looking at all dictionaries
+    """
+    Check duplicated entry by looking in all dictionaries
+    """
+
     field_to_term_to_uri_dict = {}
 
-    path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
-                              os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+    path_dict_xxx_csv_list = [
+        os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+        os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
+    ]
 
     for path_dict_xxx_csv in path_dict_xxx_csv_list:
-        print('Read {}'.format(path_dict_xxx_csv))
+        print(f'Read {path_dict_xxx_csv}')
 
         with open(path_dict_xxx_csv) as f:
             for line in f:
 
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict:
-                    print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                    print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[term] = uri
 
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict[field]:
-                    print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                    print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[field][term] = uri
 
-    return field_to_term_to_uri_dict
\ No newline at end of file
+    return field_to_term_to_uri_dict
-- cgit v1.2.3
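The two small helpers at the top of utils.py are easy to sanity-check. A quick usage sketch, run from the workflows/pull-data/genbank directory so the import resolves:

```python
from utils import chunks, is_integer

print(list(chunks([1, 2, 3, 4, 5], 2)))     # [[1, 2], [3, 4], [5]]
print(is_integer("42"), is_integer("4.2"))  # True False
```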
From 329a1a7e122eda41016185d1b1e8d50d97f8857b Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 7 Jan 2021 03:17:47 -0600
Subject: Allow for xml and xml.gz files

---
 workflows/pull-data/genbank/genbank.py      |  3 +-
 .../genbank/transform-genbank-xml2yamlfa.py | 78 ++++++++++++----------
 2 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
                 # print(n,file=sys.stderr)
                 if n != 'Unpublished':
                     institute,address = n.split(',',1)
-                    submitter.submitter_name = institute.split(') ')[1]
+                    if ")" in institute:
+                        submitter.submitter_name = institute.split(')')[1]
                     submitter.submitter_address = address.strip()
             except AttributeError:
                 pass
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
 
 for xmlfn in args.files:
     print(f"--- Reading {xmlfn}")
-    with gzip.open(xmlfn, 'r') as f:
-        xml = f.read().decode()
-        tree = ET.fromstring(xml)
-        for gb in tree.findall('./GBSeq'):
-            valid = None
-            error = None
-            meta = {}
-            id = gb.find("GBSeq_locus").text
-            basename = dir+"/"+id
-            print(f"    parsing {id}")
-            try:
-                valid,meta = genbank.get_metadata(id,gb)
-                if valid:
-                    # --- write JSON
-                    jsonfn = basename + ".json"
-                    with open(jsonfn, 'w') as outfile:
-                        print(f"    writing {jsonfn}")
-                        json.dump(meta, outfile, indent=4)
-                    # --- write FASTA
-                    fa = basename+".fa"
-                    seq = genbank.get_sequence(id,gb)
-                    print(f"    writing {fa}")
-                    with open(fa,"w") as f2:
-                        f2.write(f"> {id}\n")
-                        f2.write(seq)
-                    # print(seq)
-            except genbank.GBError as e:
-                error = f"{e} for {id}"
-                print(error,file=sys.stderr)
-                valid = False
-            state = {}
-            state['valid'] = valid
-            if error:
-                state['error'] = error
-            if meta['warnings']:
-                state['warnings'] = meta['warnings']
-            states[id] = state
+    try:
+        with gzip.open(xmlfn, 'r') as f:
+            xml = f.read().decode()
+    except Exception:
+        with open(xmlfn, 'r') as f:
+            xml = f.read()
+    tree = ET.fromstring(xml)
+    for gb in tree.findall('./GBSeq'):
+        valid = None
+        error = None
+        meta = {}
+        id = gb.find("GBSeq_locus").text
+        basename = dir+"/"+id
+        print(f"    parsing {id}")
+        try:
+            valid,meta = genbank.get_metadata(id,gb)
+            if valid:
+                # --- write JSON
+                jsonfn = basename + ".json"
+                with open(jsonfn, 'w') as outfile:
+                    print(f"    writing {jsonfn}")
+                    json.dump(meta, outfile, indent=4)
+                # --- write FASTA
+                fa = basename+".fa"
+                seq = genbank.get_sequence(id,gb)
+                print(f"    writing {fa}")
+                with open(fa,"w") as f2:
+                    f2.write(f"> {id}\n")
+                    f2.write(seq)
+                # print(seq)
+        except genbank.GBError as e:
+            error = f"{e} for {id}"
+            print(error,file=sys.stderr)
+            valid = False
+        state = {}
+        state['valid'] = valid
+        if error:
+            state['error'] = error
+        if meta['warnings']:
+            state['warnings'] = meta['warnings']
+        states[id] = state
 
 statefn = dir + '/state.json'
 with open(statefn, 'w') as outfile:
-- cgit v1.2.3
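The try/except introduced above opens a file as gzip first and falls back to plain text when that fails, which is what lets the tool accept both .xml and .xml.gz inputs. Factored into a helper, the same trick looks like this; `read_xml` is a name introduced here for illustration:

```python
import gzip

def read_xml(xmlfn):
    """Return file contents whether the file is plain XML or gzipped XML."""
    try:
        with gzip.open(xmlfn, 'r') as f:
            return f.read().decode()
    except OSError:  # gzip.BadGzipFile is an OSError: not a gzipped file
        with open(xmlfn, 'r') as f:
            return f.read()
```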