From 3c09a92423408d01b64e1b842c6b96778939d098 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 07:11:13 +0000 Subject: Rename --- workflows/tools/pubseq-fetch-ids | 67 ++++++++++++++++++++++++++++++++++++++++ workflows/tools/sparql-fetch-ids | 67 ---------------------------------------- 2 files changed, 67 insertions(+), 67 deletions(-) create mode 100755 workflows/tools/pubseq-fetch-ids delete mode 100755 workflows/tools/sparql-fetch-ids diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids new file mode 100755 index 0000000..19b2d82 --- /dev/null +++ b/workflows/tools/pubseq-fetch-ids @@ -0,0 +1,67 @@ +#!/usr/bin/env ruby +# +# Use a SPARQL query to fetch all IDs in the PubSeq database +# +# sparql-fetch-ids > pubseq_ids.txt +# +# Note: requires Ruby 3.x. Older Ruby gives a syntax error + +require 'net/http' +require 'json' +require 'ostruct' +require 'erb' +require 'pp' + +MAX=5_000 + +SPARQL_HEADER=" +prefix rdfs: +prefix rdf: +prefix dc: +prefix schema: +PREFIX pubseq: +" + +# Build a SPARQL query, submit and return results. Apply transform +# lambda when passed in +def sparql query, transform = nil + api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" + + response = Net::HTTP.get_response(URI.parse(api_url)) + data = JSON.parse(response.body,symbolize_names: true) + data => { head: { vars: }, results: { bindings: results} } + vars = vars.map { |v| v.to_sym } + results.map { |rec| + # return results after transforming to a Hash and applying the + # optional transform lambda. Note the transform can not only + # reduce results, or create an array, but also may transform into + # an OpenStruct. + res = {} + vars.each { |name| res[name] = rec[name][:value] } + if transform + transform.call(res) + else + res + end + } +end + +start = 0 +num = MAX +begin + query = " +SELECT DISTINCT ?id +FROM +WHERE { + + ?arvid ?id . + +} LIMIT #{num} OFFSET #{start} +" + list = sparql(query, lambda { |rec| rec[:id] }) + list.each do | l | + print(l,"\n") + end + $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress + start += num +end while list.size == MAX diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids deleted file mode 100755 index 19b2d82..0000000 --- a/workflows/tools/sparql-fetch-ids +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env ruby -# -# Use a SPARQL query to fetch all IDs in the PubSeq database -# -# sparql-fetch-ids > pubseq_ids.txt -# -# Note: requires Ruby 3.x. Older Ruby gives a syntax error - -require 'net/http' -require 'json' -require 'ostruct' -require 'erb' -require 'pp' - -MAX=5_000 - -SPARQL_HEADER=" -prefix rdfs: -prefix rdf: -prefix dc: -prefix schema: -PREFIX pubseq: -" - -# Build a SPARQL query, submit and return results. Apply transform -# lambda when passed in -def sparql query, transform = nil - api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" - - response = Net::HTTP.get_response(URI.parse(api_url)) - data = JSON.parse(response.body,symbolize_names: true) - data => { head: { vars: }, results: { bindings: results} } - vars = vars.map { |v| v.to_sym } - results.map { |rec| - # return results after transforming to a Hash and applying the - # optional transform lambda. 
Note the transform can not only - # reduce results, or create an array, but also may transform into - # an OpenStruct. - res = {} - vars.each { |name| res[name] = rec[name][:value] } - if transform - transform.call(res) - else - res - end - } -end - -start = 0 -num = MAX -begin - query = " -SELECT DISTINCT ?id -FROM -WHERE { - - ?arvid ?id . - -} LIMIT #{num} OFFSET #{start} -" - list = sparql(query, lambda { |rec| rec[:id] }) - list.each do | l | - print(l,"\n") - end - $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress - start += num -end while list.size == MAX -- cgit v1.2.3 From 9d75ce088e6388bf23ae077fd06b2a3f51be1bda Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 09:34:26 +0000 Subject: API: fix returned record to include original metadata --- bh20simplewebuploader/api.py | 34 ++++++++++++++++++++++++++++++++-- test/rest-api.org | 29 +++++++++++++++++++++++++++++ workflows/pull-data/genbank/README.md | 12 ++++++++++-- workflows/tools/pubseq-fetch-ids | 2 +- 4 files changed, 72 insertions(+), 5 deletions(-) diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py index b1b505f..11c74f2 100644 --- a/bh20simplewebuploader/api.py +++ b/bh20simplewebuploader/api.py @@ -7,6 +7,9 @@ import sys from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify from bh20simplewebuploader.main import app, sparqlURL +PUBSEQ="http://covid19.genenetwork.org" +ARVADOS="https://collections.lugli.arvadosapi.com/c=" + # Helper functions def fetch_sample_metadata(id): @@ -42,13 +45,40 @@ def version(): @app.route('/api/sample/.json') def sample(id): + """ + +API sample should return a record pointing to other resources, +notably: permalink, original metadata record and the fasta +data. + +curl http://localhost:5067/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "date": "2020-04-27", + "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 
2.17", + "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", + "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] + + +""" # metadata = file.name(seq)+"/metadata.yaml" meta = fetch_sample_metadata(id) print(meta) + # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml return jsonify([{ 'id': x['id']['value'], - 'fasta': x['seq']['value'], - 'collection': os.path.dirname(x['seq']['value']), + 'collection': x['seq']['value'], + 'permalink': PUBSEQ+'/resource/'+x['id']['value'], + 'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta', + 'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml', 'date': x['date']['value'], 'info': x['info']['value'], 'specimen': x['specimen']['value'], diff --git a/test/rest-api.org b/test/rest-api.org index 66639c3..2ea2b11 100644 --- a/test/rest-api.org +++ b/test/rest-api.org @@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version } #+end_src +The current API can fetch data + +#+begin_src js +curl http://covid19.genenetwork.org/api/search?s=MT533203.1 +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence" + } +] + +curl http://covid19.genenetwork.org/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "date": "2020-04-27", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 2.17", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] +#+end_src + + The Python3 version is #+begin_src python :session :exports both diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index 5464d1d..188ff6f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -../../tools/sparql-fetch-ids > pubseq_ids.txt +../../tools/pubseq-fetch-ids > pubseq_ids.txt + # --- get list of missing genbank IDs python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt @@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s ``` +## Validate GenBank data + +To pull the data from PubSeq use the list of pubseq ids generated +above. + + + # TODO -- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank +- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids index 19b2d82..f5920ec 100755 --- a/workflows/tools/pubseq-fetch-ids +++ b/workflows/tools/pubseq-fetch-ids @@ -2,7 +2,7 @@ # # Use a SPARQL query to fetch all IDs in the PubSeq database # -# sparql-fetch-ids > pubseq_ids.txt +# pubseq-fetch-ids > pubseq_ids.txt # # Note: requires Ruby 3.x. 
Older Ruby gives a syntax error -- cgit v1.2.3 From b26133cdaff3bba64c4fc294510b16d57030d071 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 10:42:16 +0000 Subject: api: more cleanup --- bh20simplewebuploader/api.py | 122 ++++++++++++++++++++---------------------- bh20simplewebuploader/main.py | 3 +- 2 files changed, 61 insertions(+), 64 deletions(-) diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py index 11c74f2..761ad03 100644 --- a/bh20simplewebuploader/api.py +++ b/bh20simplewebuploader/api.py @@ -3,6 +3,7 @@ import os import requests import sys +import types from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify from bh20simplewebuploader.main import app, sparqlURL @@ -12,14 +13,16 @@ ARVADOS="https://collections.lugli.arvadosapi.com/c=" # Helper functions -def fetch_sample_metadata(id): - query = """ +def fetch_sample(id, query=None): + default_query = """ + PREFIX pubseq: PREFIX sio: PREFIX edam: PREFIX efo: PREFIX evs: PREFIX obo: + select distinct ?id ?seq ?date ?info ?specimen ?sequencer ?mapper { ?sample sio:SIO_000115 "%s" ; @@ -27,15 +30,49 @@ def fetch_sample_metadata(id): evs:C25164 ?date . ?seq pubseq:technology ?tech ; pubseq:sample ?sample . - ?tech efo:EFO_0002699 ?mapper ; - obo:OBI_0600047 ?sequencer . + optional { ?tech efo:EFO_0002699 ?mapper } . + optional { ?tech obo:OBI_0600047 ?sequencer . } optional { ?sample edam:data_2091 ?info } . optional { ?sample obo:OBI_0001479 ?specimen } . } limit 5 + """ % id + if not query: query = default_query + print(query) payload = {'query': query, 'format': 'json'} r = requests.get(sparqlURL, params=payload) - return r.json()['results']['bindings'] + res = r.json() + print(res) + return res['results']['bindings'],res['head']['vars'] + +def fetch_one_sample(id, query=None): + """Get the top sample and return a SimpleNamespace""" + + result,varlist = fetch_sample(id,query) + h = {} + row = result[0] + for key in varlist: + if key in row: + h[key] = row[key]['value'] + print(h) + h['arv_id'] = os.path.basename(h['seq']) + return types.SimpleNamespace(**h) + +def fetch_one_record(id): + m = fetch_one_sample(id) + arv_id = m.arv_id + rec = { "id": id, + 'arv_id': arv_id, + "permalink": PUBSEQ+'/resource/'+id, + "collection": m.seq, + 'collection_date': m.date, + 'fasta': ARVADOS+arv_id+'/sequence.fasta', + 'metadata': ARVADOS+arv_id+'/metadata.yaml', + } + h = m.__dict__ # for optional items + if 'mapper' in h: rec['mapper'] = m.mapper + if 'sequencer' in h: rec['sequencer']= m.sequencer + return rec # Main API routes @@ -52,74 +89,33 @@ notably: permalink, original metadata record and the fasta data. curl http://localhost:5067/api/sample/MT533203.1.json -[ - { - "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", - "date": "2020-04-27", - "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta", - "id": "MT533203.1", - "info": "http://identifiers.org/insdc/MT533203.1#sequence", - "mapper": "minimap v. 
2.17", - "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", - "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", - "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", - "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" - } -] - +{ + "id": "MT533203.1", + "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", + "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "collection_date": "2020-04-27", + "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta", + "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", + "mapper": "minimap v. 2.17", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632" +} """ - # metadata = file.name(seq)+"/metadata.yaml" - meta = fetch_sample_metadata(id) - print(meta) - # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml - return jsonify([{ - 'id': x['id']['value'], - 'collection': x['seq']['value'], - 'permalink': PUBSEQ+'/resource/'+x['id']['value'], - 'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta', - 'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml', - 'date': x['date']['value'], - 'info': x['info']['value'], - 'specimen': x['specimen']['value'], - 'sequencer': x['sequencer']['value'], - 'mapper': x['mapper']['value'], - } for x in meta]) + + return jsonify([fetch_one_record(id)]) @app.route('/api/ebi/sample-.xml', methods=['GET']) def ebi_sample(id): - meta = fetch_sample_metadata(id)[0] + meta,varlist = fetch_sample(id)[0] page = render_template('ebi-sample.xml',sampleid=id,sequencer=meta['sequencer']['value'],date=meta['date']['value'],specimen=meta['specimen']['value']) return page @app.route('/api/search', methods=['GET']) def search(): """ - Execute a 'global search' + Execute a 'global search'. Currently just duplicates fetch one + sample. Should be more flexible FIXME. """ s = request.args.get('s') - if s == "": - s = "MT326090.1" - query = """ - PREFIX pubseq: - PREFIX sio: - PREFIX edam: - select distinct ?id ?seq ?info - { - ?sample sio:SIO_000115 "%s" . - ?sample sio:SIO_000115 ?id . - ?seq pubseq:sample ?sample . - ?sample edam:data_2091 ?info . - } limit 100 - """ % s - payload = {'query': query, 'format': 'json'} - r = requests.get(sparqlURL, params=payload) - result = r.json()['results']['bindings'] - # metadata = file.name(seq)+"/metadata.yaml" - print(result) - return jsonify([{ - 'id': x['id']['value'], - 'fasta': x['seq']['value'], - 'collection': os.path.dirname(x['seq']['value']), - 'info': x['info']['value'], - } for x in result]) + if s == "": s = "MT326090.1" + return jsonify([fetch_one_record(s)]) diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index b620946..b4b72d2 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -34,6 +34,7 @@ if not os.path.isfile('bh20sequploader/main.py'): print("WARNING: run FLASK from the root of the source repository!", file=sys.stderr) app = Flask(__name__, static_url_path='/static', static_folder='static') +app.config['JSON_SORT_KEYS'] = False # Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. # We will enforce the limit ourselves and set a higher safety limit here. 
@@ -252,7 +253,7 @@ FORM_ITEMS = load_schema_generate_form() def get_feed_items(name, start=0, stop=9): redis_client = redis.Redis(host=os.environ.get('HOST', 'localhost'), port=os.environ.get('PORT', 6379), - db=os.environ.get('REDIS_DB', 0)) + db=os.environ.get('REDIS_DB', 0)) feed_items = [] try: for el in redis_client.zrevrange(name, start, stop): -- cgit v1.2.3 From bcc2ea8521d0366753115546b30824a01757b570 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 10:55:22 +0000 Subject: web: show https --- bh20simplewebuploader/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index b4b72d2..0c58712 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -751,8 +751,8 @@ union # http://covid19.genenetwork.org/resource/lugli-4zz18-gx0ifousk9yu0ql m = re.match(r"http://collections.lugli.arvadosapi.com/c=([^/]*)/sequence.fasta|http://covid19.genenetwork.org/resource/(.*)", sequenceuri) collection = m.group(1) or m.group(2) - fastauri = f"http://collections.lugli.arvadosapi.com/c={collection}/sequence.fasta" - metauri = f"http://collections.lugli.arvadosapi.com/c={collection}/metadata.yaml" + fastauri = f"https://collections.lugli.arvadosapi.com/c={collection}/sequence.fasta" + metauri = f"https://collections.lugli.arvadosapi.com/c={collection}/metadata.yaml" locationuri=sample['geo']['value'] location=sample['geoname']['value'] date=sample['date']['value'] -- cgit v1.2.3 From 1187fa716cacde2b50566b67b5d619b8f12894f9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 12:07:39 +0000 Subject: fetches original metadata from PubSeq/Arvados --- workflows/tools/pubseq-fetch-data.py | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100755 workflows/tools/pubseq-fetch-data.py diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py new file mode 100755 index 0000000..c22d754 --- /dev/null +++ b/workflows/tools/pubseq-fetch-data.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import requests +import sys + +parser = argparse.ArgumentParser(description=""" + +Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs +can be passed in on the command line or in a file. 
+ +""") +parser.add_argument('--out', type=str, help='Directory to write to', +required=True) +parser.add_argument('--ids', type=str, help='File with ids', required=False) +parser.add_argument('id', nargs='*', help='id(s)') +args = parser.parse_args() + +dir = args.out +if not os.path.exists(dir): + raise Exception(f"Directory {dir} does not exist") + +ids = args.id +if (len(ids)==0): + print(f"Reading {args.ids}") + with open(args.ids) as f: + ids = [ l.strip() for l in f.readlines() ] + +for id in ids[0:2]: + print(id) + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + if r: + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + meta = mr.json() + with open(dir+"/"+id+".json","w") as outf: + json.dump(meta, outf, indent=4) + else: + raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From 2ceeccd5e5158362548b868390e9d411f73cd9ff Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 12:29:41 +0000 Subject: fetch: do a straight dump of the original record --- workflows/tools/pubseq-fetch-data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index c22d754..3f5e6cf 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -28,14 +28,13 @@ if (len(ids)==0): with open(args.ids) as f: ids = [ l.strip() for l in f.readlines() ] -for id in ids[0:2]: +for id in ids: print(id) r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") if r: m_url = r.json()[0]['metadata'] mr = requests.get(m_url) - meta = mr.json() with open(dir+"/"+id+".json","w") as outf: - json.dump(meta, outf, indent=4) + outf.write(mr.text) else: raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From ced9613aa1c18c6a68056d1898b69865beac9ac2 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 12:35:05 +0000 Subject: Add option for fetching fasta --- workflows/tools/pubseq-fetch-data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 3f5e6cf..23c4dea 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -12,6 +12,7 @@ Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs can be passed in on the command line or in a file. 
""") +parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records') parser.add_argument('--out', type=str, help='Directory to write to', required=True) parser.add_argument('--ids', type=str, help='File with ids', required=False) @@ -36,5 +37,10 @@ for id in ids: mr = requests.get(m_url) with open(dir+"/"+id+".json","w") as outf: outf.write(mr.text) + if args.fasta: + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(dir+"/"+id+".fa","w") as outf: + outf.write(fr.text) else: raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From 17cd8caa85991784f205109f2b64b255726a0e80 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 07:13:15 -0600 Subject: Fetching fixes --- workflows/tools/pubseq-fetch-data.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 23c4dea..2119fdf 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -31,16 +31,20 @@ if (len(ids)==0): for id in ids: print(id) - r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") - if r: - m_url = r.json()[0]['metadata'] - mr = requests.get(m_url) - with open(dir+"/"+id+".json","w") as outf: - outf.write(mr.text) - if args.fasta: - fa_url = r.json()[0]['fasta'] - fr = requests.get(fa_url) - with open(dir+"/"+id+".fa","w") as outf: - outf.write(fr.text) - else: - raise Exception(f"Can not find record for {id}") + jsonfn = dir+"/"+id+".json" + if not os.path.exists(jsonfn): + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + if r: + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + else: + raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From f7666a7766c8138aa690340fc68cb67f709327f3 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 5 Jan 2021 17:35:46 +0100 Subject: cleaning genbank-fetch-ids.py --- workflows/pull-data/genbank/genbank-fetch-ids.py | 40 +++++++++++------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py index 1962daa..cb48cd8 100755 --- a/workflows/pull-data/genbank/genbank-fetch-ids.py +++ b/workflows/pull-data/genbank/genbank-fetch-ids.py @@ -6,28 +6,20 @@ # # See also directory .guix-run and README.md -BATCH_SIZE=5000 - import argparse -import json -import os -import requests import sys -import xml.etree.ElementTree as ET -from datetime import date, datetime -from dateutil.parser import parse +from datetime import date + +from Bio import Entrez parser = argparse.ArgumentParser() parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False) args = parser.parse_args() -from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME - -# min_acceptable_collection_date = datetime(2019, 12, 1) +BATCH_SIZE = 5000 -today_date = date.today().strftime("%Y.%m.%d") +Entrez.email = 'another_email@gmail.com' # FIXME skip = set() if args.skip: @@ -36,10 +28,11 @@ if args.skip: for line in content: skip.add(line.strip()) 
-print(f"Skip size is {len(skip)}",file=sys.stderr) +print(f"Skip size is {len(skip)}", file=sys.stderr) # Try to search several strings TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]'] + # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP'] @@ -47,22 +40,27 @@ ids = set() for term in TERMS: num_read = BATCH_SIZE retstart = 0 + while num_read == BATCH_SIZE: record = Entrez.read( - Entrez.esearch(db='nuccore', term=term, idtype='acc', - retstart=retstart, retmax=BATCH_SIZE) + Entrez.esearch(db='nuccore', term=term, idtype='acc', retstart=retstart, retmax=BATCH_SIZE) ) + idlist = record['IdList'] new_ids = set(idlist) num_read = len(new_ids) - print(num_read,":",idlist[0],file=sys.stderr) retstart += num_read - new_ids.difference_update(skip) # remove skip ids + + print(num_read, ":", idlist[0], file=sys.stderr) + + new_ids.difference_update(skip) # remove skip ids new_ids = set([id for id in new_ids if id[:2] not in PREFIX]) - ids.update(new_ids) # add to total set - print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)})",file=sys.stderr) + ids.update(new_ids) # add to total set + + print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)}", file=sys.stderr) + if args.max and len(ids) > args.max: - print(f"Stopping past #{args.max} items",file=sys.stderr) + print(f"Stopping past #{args.max} items", file=sys.stderr) break for id in ids: -- cgit v1.2.3 From aa4b80f9dd8fbbcc2c7fb7f4869dc142fcf61ceb Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 5 Jan 2021 19:54:01 +0300 Subject: Pass extra param that contains all news items to the template * bh20simplewebuploader/main.py (send_home): Add extra all_items when rendering the template. --- bh20simplewebuploader/main.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index 0c58712..504f03c 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -273,12 +273,23 @@ def send_home(): """ Send the front page. 
""" + (tweets, + commits, + pubmed_articles, + arxiv_articles) = [get_feed_items(x) for x in ["bh20-tweet-score:", + "bh20-commit-score:", + "bh20-pubmed-score:", + "bh20-arxiv-score:"]] return render_template( 'home.html', menu='HOME', - tweets=get_feed_items("bh20-tweet-score:"), - commits=get_feed_items("bh20-commit-score:"), - pubmed_articles=get_feed_items("bh20-pubmed-score:"), - arxiv_articles=get_feed_items("bh20-arxiv-score:"), + all_items=list(itertools.chain(tweets, + commits, + pubmed_articles, + arxiv_articles)), + tweets=tweets, + commits=commits, + pubmed_articles=pubmed_articles, + arxiv_articles=arxiv_articles, load_map=True) -- cgit v1.2.3 From 911ba372cfc4b35c5b52d18a573a636ea78d16d7 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 5 Jan 2021 17:56:19 +0100 Subject: cleaning update-from-genbank.py; removed unused import from genbank-fetch-ids.py --- workflows/pull-data/genbank/genbank-fetch-ids.py | 1 - workflows/pull-data/genbank/update-from-genbank.py | 25 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py index cb48cd8..e9e7315 100755 --- a/workflows/pull-data/genbank/genbank-fetch-ids.py +++ b/workflows/pull-data/genbank/genbank-fetch-ids.py @@ -8,7 +8,6 @@ import argparse import sys -from datetime import date from Bio import Entrez diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py index dca5563..95f5a93 100755 --- a/workflows/pull-data/genbank/update-from-genbank.py +++ b/workflows/pull-data/genbank/update-from-genbank.py @@ -14,22 +14,21 @@ import sys from utils import chunks from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME -BATCH=100 +Entrez.email = 'another_email@gmail.com' # FIXME + +BATCH = 100 parser = argparse.ArgumentParser() -parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True) parser.add_argument('--out', type=str, help='Directory to write to', required=True) +parser.add_argument('--max', type=int, help='Max queries', required=False) args = parser.parse_args() ids = set() with open(args.ids) as f: - content = f.readlines() - for line in content: - ids.add(line.strip()) + ids.update([line.strip() for line in f]) dir = args.out if not os.path.exists(dir): @@ -37,12 +36,14 @@ if not os.path.exists(dir): request_num = BATCH if args.max: - request_num = min(BATCH,args.max) + request_num = min(BATCH, args.max) + +for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)): + xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz") + print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr) -for i, idsx in enumerate(chunks(list(ids), request_num)): - xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz") - print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr) with gzip.open(xmlfn, 'w') as f: - f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode()) - if args.max and i*request_num >= args.max: + f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode()) + + if args.max and num_chunk * request_num >= args.max: break -- cgit v1.2.3 From b0a3ce5ae0da2bbbc39ee2ebad57cd22393d1f5c Mon Sep 17 00:00:00 2001 From: BonfaceKilz Date: Tue, 5 Jan 2021 20:00:38 +0300 Subject: Add extra tab to display all items --- bh20simplewebuploader/static/main.css | 
1 +
 bh20simplewebuploader/templates/home.html | 71 ++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
index fbc721e..e2f0c83 100644
--- a/bh20simplewebuploader/static/main.css
+++ b/bh20simplewebuploader/static/main.css
@@ -567,6 +567,7 @@ input[name="feed-tabs"] ~ .tab {
   display: none;
 }
+#tab-all-items:checked ~ .tab.content-all-items,
 #tab-pubmed-articles:checked ~ .tab.content-pubmed-articles,
 #tab-arxiv-articles:checked ~ .tab.content-arxiv-articles,
 #tab-tweets:checked ~ .tab.content-tweets,
diff --git a/bh20simplewebuploader/templates/home.html b/bh20simplewebuploader/templates/home.html
index a880f81..23f48bf 100644
--- a/bh20simplewebuploader/templates/home.html
+++ b/bh20simplewebuploader/templates/home.html
@@ -29,7 +29,9 @@
+ {% if all_items %}
+ {% for item in all_items|sort(reverse=true, attribute="score")%}
+ {% if item['authors'] %}
+ [arxiv]
+ {{item['title']}}
+ Authors: {{ item['authors'] }}
+ Abstract: {{ item['abstract']}}...
+ Submitted: {{ item['submission']}}
+ {% elif item['full-authors'] %}
+ [Pubmed]:
+ Summary:
+ {{ item['summary'] }}
+ Full Authors: {{ item['full-authors'] }}
+ Short Authors: {{ item['short-authors'] }}
+ Citation: {{ item['citation'] }}
+ Short Journal Citation: {{ item['short-journal-citation'] }}
+ {% elif item['tweet'] %}
+ [Tweet]:
+ {{ item['tweet']|urlize(40, target="_blank")}}
+ source
+ by {{ item['author'] }}
+ {{ item['timeposted'] }}
+ {% elif item['repository-url'] %}
+ [Commit]:
+ {{ item.hash.split(":")[-1][:7] }}: {{ item.content }}
+ {{ item.author }}/{{ item.repository }}
+ on {{ item.timeposted }}
+ {% endif %}
+ {%endfor%}
+ {% else %}
+ There are no items to display :(
+ {% endif %}
    {% if arxiv_articles %} {% for article in arxiv_articles|sort(reverse=true, attribute="score")%} -- cgit v1.2.3 From c31835f787f3ae36e26bad0a1803f8557f8084e7 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 6 Jan 2021 02:33:35 -0600 Subject: Pubseq fetch: sometimes a request times out. So repeat with intervals. --- workflows/tools/pubseq-fetch-data.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 2119fdf..ef4edde 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -5,6 +5,7 @@ import json import os import requests import sys +import time parser = argparse.ArgumentParser(description=""" @@ -33,18 +34,22 @@ for id in ids: print(id) jsonfn = dir+"/"+id+".json" if not os.path.exists(jsonfn): + count = 0 r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") - if r: - m_url = r.json()[0]['metadata'] - mr = requests.get(m_url) - with open(dir+"/"+id+".json","w") as outf: - outf.write(mr.text) - if args.fasta: - fastafn = dir+"/"+id+".fa" - if os.path.exists(fastafn): continue - fa_url = r.json()[0]['fasta'] - fr = requests.get(fa_url) - with open(fastafn,"w") as outf: - outf.write(fr.text) - else: - raise Exception(f"Can not find record for {id}") + while not r: + count += 1 + if count>10: raise Exception(f"Can not find record for {id}") + time.sleep(15) + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + -- cgit v1.2.3 From 27a2b926036211469eccbf8c3d9580182482bdc2 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 6 Jan 2021 14:13:13 +0100 Subject: cleaned utils.py --- workflows/pull-data/genbank/utils.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py index 3efc67a..96920a5 100644 --- a/workflows/pull-data/genbank/utils.py +++ b/workflows/pull-data/genbank/utils.py @@ -1,5 +1,6 @@ import os + def is_integer(string_to_check): try: int(string_to_check) @@ -7,19 +8,26 @@ def is_integer(string_to_check): except ValueError: return False + def chunks(lst, n): for i in range(0, len(lst), n): yield lst[i:i + n] + def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): - # Check duplicated entry looking at all dictionaries + """ + Check duplicated entry by looking in all dictionaries + """ + field_to_term_to_uri_dict = {} - path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in - os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')] + path_dict_xxx_csv_list = [ + os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in + os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv') + ] for path_dict_xxx_csv in path_dict_xxx_csv_list: - print('Read {}'.format(path_dict_xxx_csv)) + print(f'Read {path_dict_xxx_csv}') with open(path_dict_xxx_csv) as f: for line in f: @@ -31,7 +39,7 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): term = term.strip('"') if term in field_to_term_to_uri_dict: - print('Warning: in 
the dictionaries there are more entries for the same term ({}).'.format(term)) + print(f'Warning: in the dictionaries there are more entries for the same term ({term}).') continue field_to_term_to_uri_dict[term] = uri @@ -54,9 +62,9 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): term = term.strip('"') if term in field_to_term_to_uri_dict[field]: - print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term)) + print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).') continue field_to_term_to_uri_dict[field][term] = uri - return field_to_term_to_uri_dict \ No newline at end of file + return field_to_term_to_uri_dict -- cgit v1.2.3 From 329a1a7e122eda41016185d1b1e8d50d97f8857b Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 7 Jan 2021 03:17:47 -0600 Subject: Allow for xml and xml.gz files --- workflows/pull-data/genbank/genbank.py | 3 +- .../genbank/transform-genbank-xml2yamlfa.py | 78 ++++++++++++---------- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 85d615c..026c03f 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -111,7 +111,8 @@ def get_metadata(id, gbseq): # print(n,file=sys.stderr) if n != 'Unpublished': institute,address = n.split(',',1) - submitter.submitter_name = institute.split(') ')[1] + if ")" in institute: + submitter.submitter_name = institute.split(')')[1] submitter.submitter_address = address.strip() except AttributeError: pass diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py index 9414864..1a8035d 100755 --- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py +++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py @@ -33,43 +33,47 @@ states = {} for xmlfn in args.files: print(f"--- Reading {xmlfn}") - with gzip.open(xmlfn, 'r') as f: - xml = f.read().decode() - tree = ET.fromstring(xml) - for gb in tree.findall('./GBSeq'): - valid = None - error = None - meta = {} - id = gb.find("GBSeq_locus").text - basename = dir+"/"+id - print(f" parsing {id}") - try: - valid,meta = genbank.get_metadata(id,gb) - if valid: - # --- write JSON - jsonfn = basename + ".json" - with open(jsonfn, 'w') as outfile: - print(f" writing {jsonfn}") - json.dump(meta, outfile, indent=4) - # --- write FASTA - fa = basename+".fa" - seq = genbank.get_sequence(id,gb) - print(f" writing {fa}") - with open(fa,"w") as f2: - f2.write(f"> {id}\n") - f2.write(seq) - # print(seq) - except genbank.GBError as e: - error = f"{e} for {id}" - print(error,file=sys.stderr) - valid = False - state = {} - state['valid'] = valid - if error: - state['error'] = error - if meta['warnings']: - state['warnings'] = meta['warnings'] - states[id] = state + try: + with gzip.open(xmlfn, 'r') as f: + xml = f.read().decode() + except Exception: + with open(xmlfn, 'r') as f: + xml = f.read() + tree = ET.fromstring(xml) + for gb in tree.findall('./GBSeq'): + valid = None + error = None + meta = {} + id = gb.find("GBSeq_locus").text + basename = dir+"/"+id + print(f" parsing {id}") + try: + valid,meta = genbank.get_metadata(id,gb) + if valid: + # --- write JSON + jsonfn = basename + ".json" + with open(jsonfn, 'w') as outfile: + print(f" writing {jsonfn}") + json.dump(meta, outfile, indent=4) + # --- write FASTA + fa = basename+".fa" + seq = genbank.get_sequence(id,gb) 
+ print(f" writing {fa}") + with open(fa,"w") as f2: + f2.write(f"> {id}\n") + f2.write(seq) + # print(seq) + except genbank.GBError as e: + error = f"{e} for {id}" + print(error,file=sys.stderr) + valid = False + state = {} + state['valid'] = valid + if error: + state['error'] = error + if meta['warnings']: + state['warnings'] = meta['warnings'] + states[id] = state statefn = dir + '/state.json' with open(statefn, 'w') as outfile: -- cgit v1.2.3 From c080c3cffedcc0cc99496b5e70fcfdf998978f16 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 7 Jan 2021 23:46:18 +0100 Subject: fixed comments in the example yaml files --- example/esr_example.yaml | 4 ++-- example/uthsc_example.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/esr_example.yaml b/example/esr_example.yaml index c97d0bf..c7bdb30 100644 --- a/example/esr_example.yaml +++ b/example/esr_example.yaml @@ -15,7 +15,7 @@ sample: collection_date: "2020-02-26" collection_location: https://www.wikidata.org/wiki/Q37100 specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] - source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] ?? + source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] additional_collection_information: Optional free text field for additional information virus: @@ -23,7 +23,7 @@ virus: virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 technology: - sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] // Nanopore MinION + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION alignment_protocol: https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" additional_technology_information: "Artic V3 workflow" diff --git a/example/uthsc_example.yaml b/example/uthsc_example.yaml index 661cf60..589a7a5 100644 --- a/example/uthsc_example.yaml +++ b/example/uthsc_example.yaml @@ -23,7 +23,7 @@ virus: virus_strain: SARS-CoV-2/human/USA/AL_UT14/2020 technology: - sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] // Nanopore MinION + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION alignment_protocol: guppy assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" additional_technology_information: Optional free text field for additional information -- cgit v1.2.3
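The "Validate GenBank data" section in the README patch above stops short of showing the fetch step. Below is a minimal usage sketch for the pubseq-fetch-data.py script introduced in these patches, based only on the argparse options visible in the diffs (--out, --ids, --fasta and positional ids); the paths follow the README examples and the output directory name is illustrative, not taken from the repository.

```sh
# list every id currently in PubSeq (same first step as the README)
../../tools/pubseq-fetch-ids > pubseq_ids.txt

# the script requires the output directory to exist
mkdir -p pubseq_data

# pull the original metadata record, and optionally the FASTA, for each id;
# the script writes <out>/<id>.json and <out>/<id>.fa, skips files that
# already exist, and retries a timed-out request before giving up
python3 ../../tools/pubseq-fetch-data.py --out pubseq_data --ids pubseq_ids.txt --fasta

# single ids can also be passed positionally
python3 ../../tools/pubseq-fetch-data.py --out pubseq_data MT533203.1
```

Because existing .json and .fa files are skipped, an interrupted download can be resumed by re-running the same command.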