From 3c09a92423408d01b64e1b842c6b96778939d098 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 07:11:13 +0000 Subject: Rename --- workflows/tools/pubseq-fetch-ids | 67 ++++++++++++++++++++++++++++++++++++++++ workflows/tools/sparql-fetch-ids | 67 ---------------------------------------- 2 files changed, 67 insertions(+), 67 deletions(-) create mode 100755 workflows/tools/pubseq-fetch-ids delete mode 100755 workflows/tools/sparql-fetch-ids (limited to 'workflows/tools') diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids new file mode 100755 index 0000000..19b2d82 --- /dev/null +++ b/workflows/tools/pubseq-fetch-ids @@ -0,0 +1,67 @@ +#!/usr/bin/env ruby +# +# Use a SPARQL query to fetch all IDs in the PubSeq database +# +# sparql-fetch-ids > pubseq_ids.txt +# +# Note: requires Ruby 3.x. Older Ruby gives a syntax error + +require 'net/http' +require 'json' +require 'ostruct' +require 'erb' +require 'pp' + +MAX=5_000 + +SPARQL_HEADER=" +prefix rdfs: +prefix rdf: +prefix dc: +prefix schema: +PREFIX pubseq: +" + +# Build a SPARQL query, submit and return results. Apply transform +# lambda when passed in +def sparql query, transform = nil + api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" + + response = Net::HTTP.get_response(URI.parse(api_url)) + data = JSON.parse(response.body,symbolize_names: true) + data => { head: { vars: }, results: { bindings: results} } + vars = vars.map { |v| v.to_sym } + results.map { |rec| + # return results after transforming to a Hash and applying the + # optional transform lambda. Note the transform can not only + # reduce results, or create an array, but also may transform into + # an OpenStruct. + res = {} + vars.each { |name| res[name] = rec[name][:value] } + if transform + transform.call(res) + else + res + end + } +end + +start = 0 +num = MAX +begin + query = " +SELECT DISTINCT ?id +FROM +WHERE { + + ?arvid ?id . + +} LIMIT #{num} OFFSET #{start} +" + list = sparql(query, lambda { |rec| rec[:id] }) + list.each do | l | + print(l,"\n") + end + $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress + start += num +end while list.size == MAX diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids deleted file mode 100755 index 19b2d82..0000000 --- a/workflows/tools/sparql-fetch-ids +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env ruby -# -# Use a SPARQL query to fetch all IDs in the PubSeq database -# -# sparql-fetch-ids > pubseq_ids.txt -# -# Note: requires Ruby 3.x. Older Ruby gives a syntax error - -require 'net/http' -require 'json' -require 'ostruct' -require 'erb' -require 'pp' - -MAX=5_000 - -SPARQL_HEADER=" -prefix rdfs: -prefix rdf: -prefix dc: -prefix schema: -PREFIX pubseq: -" - -# Build a SPARQL query, submit and return results. Apply transform -# lambda when passed in -def sparql query, transform = nil - api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" - - response = Net::HTTP.get_response(URI.parse(api_url)) - data = JSON.parse(response.body,symbolize_names: true) - data => { head: { vars: }, results: { bindings: results} } - vars = vars.map { |v| v.to_sym } - results.map { |rec| - # return results after transforming to a Hash and applying the - # optional transform lambda. Note the transform can not only - # reduce results, or create an array, but also may transform into - # an OpenStruct. - res = {} - vars.each { |name| res[name] = rec[name][:value] } - if transform - transform.call(res) - else - res - end - } -end - -start = 0 -num = MAX -begin - query = " -SELECT DISTINCT ?id -FROM -WHERE { - - ?arvid ?id . - -} LIMIT #{num} OFFSET #{start} -" - list = sparql(query, lambda { |rec| rec[:id] }) - list.each do | l | - print(l,"\n") - end - $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress - start += num -end while list.size == MAX -- cgit v1.2.3 From 9d75ce088e6388bf23ae077fd06b2a3f51be1bda Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 09:34:26 +0000 Subject: API: fix returned record to include original metadata --- bh20simplewebuploader/api.py | 34 ++++++++++++++++++++++++++++++++-- test/rest-api.org | 29 +++++++++++++++++++++++++++++ workflows/pull-data/genbank/README.md | 12 ++++++++++-- workflows/tools/pubseq-fetch-ids | 2 +- 4 files changed, 72 insertions(+), 5 deletions(-) (limited to 'workflows/tools') diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py index b1b505f..11c74f2 100644 --- a/bh20simplewebuploader/api.py +++ b/bh20simplewebuploader/api.py @@ -7,6 +7,9 @@ import sys from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify from bh20simplewebuploader.main import app, sparqlURL +PUBSEQ="http://covid19.genenetwork.org" +ARVADOS="https://collections.lugli.arvadosapi.com/c=" + # Helper functions def fetch_sample_metadata(id): @@ -42,13 +45,40 @@ def version(): @app.route('/api/sample/.json') def sample(id): + """ + +API sample should return a record pointing to other resources, +notably: permalink, original metadata record and the fasta +data. + +curl http://localhost:5067/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "date": "2020-04-27", + "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 2.17", + "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", + "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] + + +""" # metadata = file.name(seq)+"/metadata.yaml" meta = fetch_sample_metadata(id) print(meta) + # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml return jsonify([{ 'id': x['id']['value'], - 'fasta': x['seq']['value'], - 'collection': os.path.dirname(x['seq']['value']), + 'collection': x['seq']['value'], + 'permalink': PUBSEQ+'/resource/'+x['id']['value'], + 'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta', + 'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml', 'date': x['date']['value'], 'info': x['info']['value'], 'specimen': x['specimen']['value'], diff --git a/test/rest-api.org b/test/rest-api.org index 66639c3..2ea2b11 100644 --- a/test/rest-api.org +++ b/test/rest-api.org @@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version } #+end_src +The current API can fetch data + +#+begin_src js +curl http://covid19.genenetwork.org/api/search?s=MT533203.1 +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence" + } +] + +curl http://covid19.genenetwork.org/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "date": "2020-04-27", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 2.17", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] +#+end_src + + The Python3 version is #+begin_src python :session :exports both diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index 5464d1d..188ff6f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -../../tools/sparql-fetch-ids > pubseq_ids.txt +../../tools/pubseq-fetch-ids > pubseq_ids.txt + # --- get list of missing genbank IDs python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt @@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s ``` +## Validate GenBank data + +To pull the data from PubSeq use the list of pubseq ids generated +above. + + + # TODO -- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank +- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids index 19b2d82..f5920ec 100755 --- a/workflows/tools/pubseq-fetch-ids +++ b/workflows/tools/pubseq-fetch-ids @@ -2,7 +2,7 @@ # # Use a SPARQL query to fetch all IDs in the PubSeq database # -# sparql-fetch-ids > pubseq_ids.txt +# pubseq-fetch-ids > pubseq_ids.txt # # Note: requires Ruby 3.x. Older Ruby gives a syntax error -- cgit v1.2.3 From 1187fa716cacde2b50566b67b5d619b8f12894f9 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 12:07:39 +0000 Subject: fetches original metadata from PubSeq/Arvados --- workflows/tools/pubseq-fetch-data.py | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100755 workflows/tools/pubseq-fetch-data.py (limited to 'workflows/tools') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py new file mode 100755 index 0000000..c22d754 --- /dev/null +++ b/workflows/tools/pubseq-fetch-data.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import requests +import sys + +parser = argparse.ArgumentParser(description=""" + +Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs +can be passed in on the command line or in a file. + +""") +parser.add_argument('--out', type=str, help='Directory to write to', +required=True) +parser.add_argument('--ids', type=str, help='File with ids', required=False) +parser.add_argument('id', nargs='*', help='id(s)') +args = parser.parse_args() + +dir = args.out +if not os.path.exists(dir): + raise Exception(f"Directory {dir} does not exist") + +ids = args.id +if (len(ids)==0): + print(f"Reading {args.ids}") + with open(args.ids) as f: + ids = [ l.strip() for l in f.readlines() ] + +for id in ids[0:2]: + print(id) + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + if r: + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + meta = mr.json() + with open(dir+"/"+id+".json","w") as outf: + json.dump(meta, outf, indent=4) + else: + raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From 2ceeccd5e5158362548b868390e9d411f73cd9ff Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 12:29:41 +0000 Subject: fetch: do a straight dump of the original record --- workflows/tools/pubseq-fetch-data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'workflows/tools') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index c22d754..3f5e6cf 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -28,14 +28,13 @@ if (len(ids)==0): with open(args.ids) as f: ids = [ l.strip() for l in f.readlines() ] -for id in ids[0:2]: +for id in ids: print(id) r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") if r: m_url = r.json()[0]['metadata'] mr = requests.get(m_url) - meta = mr.json() with open(dir+"/"+id+".json","w") as outf: - json.dump(meta, outf, indent=4) + outf.write(mr.text) else: raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From ced9613aa1c18c6a68056d1898b69865beac9ac2 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 12:35:05 +0000 Subject: Add option for fetching fasta --- workflows/tools/pubseq-fetch-data.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'workflows/tools') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 3f5e6cf..23c4dea 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -12,6 +12,7 @@ Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs can be passed in on the command line or in a file. """) +parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records') parser.add_argument('--out', type=str, help='Directory to write to', required=True) parser.add_argument('--ids', type=str, help='File with ids', required=False) @@ -36,5 +37,10 @@ for id in ids: mr = requests.get(m_url) with open(dir+"/"+id+".json","w") as outf: outf.write(mr.text) + if args.fasta: + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(dir+"/"+id+".fa","w") as outf: + outf.write(fr.text) else: raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From 17cd8caa85991784f205109f2b64b255726a0e80 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 07:13:15 -0600 Subject: Fetching fixes --- workflows/tools/pubseq-fetch-data.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'workflows/tools') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 23c4dea..2119fdf 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -31,16 +31,20 @@ if (len(ids)==0): for id in ids: print(id) - r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") - if r: - m_url = r.json()[0]['metadata'] - mr = requests.get(m_url) - with open(dir+"/"+id+".json","w") as outf: - outf.write(mr.text) - if args.fasta: - fa_url = r.json()[0]['fasta'] - fr = requests.get(fa_url) - with open(dir+"/"+id+".fa","w") as outf: - outf.write(fr.text) - else: - raise Exception(f"Can not find record for {id}") + jsonfn = dir+"/"+id+".json" + if not os.path.exists(jsonfn): + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + if r: + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + else: + raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From c31835f787f3ae36e26bad0a1803f8557f8084e7 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 6 Jan 2021 02:33:35 -0600 Subject: Pubseq fetch: sometimes a request times out. So repeat with intervals. --- workflows/tools/pubseq-fetch-data.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'workflows/tools') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 2119fdf..ef4edde 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -5,6 +5,7 @@ import json import os import requests import sys +import time parser = argparse.ArgumentParser(description=""" @@ -33,18 +34,22 @@ for id in ids: print(id) jsonfn = dir+"/"+id+".json" if not os.path.exists(jsonfn): + count = 0 r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") - if r: - m_url = r.json()[0]['metadata'] - mr = requests.get(m_url) - with open(dir+"/"+id+".json","w") as outf: - outf.write(mr.text) - if args.fasta: - fastafn = dir+"/"+id+".fa" - if os.path.exists(fastafn): continue - fa_url = r.json()[0]['fasta'] - fr = requests.get(fa_url) - with open(fastafn,"w") as outf: - outf.write(fr.text) - else: - raise Exception(f"Can not find record for {id}") + while not r: + count += 1 + if count>10: raise Exception(f"Can not find record for {id}") + time.sleep(15) + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + -- cgit v1.2.3