From 3c09a92423408d01b64e1b842c6b96778939d098 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 07:11:13 +0000
Subject: Rename

---
 workflows/tools/pubseq-fetch-ids | 67 ++++++++++++++++++++++++++++++++++++++++
 workflows/tools/sparql-fetch-ids | 67 ----------------------------------------
 2 files changed, 67 insertions(+), 67 deletions(-)
 create mode 100755 workflows/tools/pubseq-fetch-ids
 delete mode 100755 workflows/tools/sparql-fetch-ids

diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
new file mode 100755
index 0000000..19b2d82
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-ids
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+#   sparql-fetch-ids > pubseq_ids.txt
+#
+# Note: requires Ruby 3.x. Older Ruby gives a syntax error
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+require 'pp'
+
+MAX=5_000
+
+SPARQL_HEADER="
+prefix rdfs:
+prefix rdf:
+prefix dc:
+prefix schema:
+PREFIX pubseq:
+"
+
+# Build a SPARQL query, submit and return results. Apply transform
+# lambda when passed in
+def sparql query, transform = nil
+  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
+
+  response = Net::HTTP.get_response(URI.parse(api_url))
+  data = JSON.parse(response.body,symbolize_names: true)
+  data => { head: { vars: }, results: { bindings: results} }
+  vars = vars.map { |v| v.to_sym }
+  results.map { |rec|
+    # return results after transforming to a Hash and applying the
+    # optional transform lambda. Note the transform can not only
+    # reduce results, or create an array, but also may transform into
+    # an OpenStruct.
+    res = {}
+    vars.each { |name| res[name] = rec[name][:value] }
+    if transform
+      transform.call(res)
+    else
+      res
+    end
+  }
+end
+
+start = 0
+num = MAX
+begin
+  query = "
+SELECT DISTINCT ?id
+FROM
+WHERE {
+
+  ?arvid ?id .
+
+} LIMIT #{num} OFFSET #{start}
+"
+  list = sparql(query, lambda { |rec| rec[:id] })
+  list.each do | l |
+    print(l,"\n")
+  end
+  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
+  start += num
+end while list.size == MAX
diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/sparql-fetch-ids
deleted file mode 100755
index 19b2d82..0000000
--- a/workflows/tools/sparql-fetch-ids
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Use a SPARQL query to fetch all IDs in the PubSeq database
-#
-#   sparql-fetch-ids > pubseq_ids.txt
-#
-# Note: requires Ruby 3.x. Older Ruby gives a syntax error
-
-require 'net/http'
-require 'json'
-require 'ostruct'
-require 'erb'
-require 'pp'
-
-MAX=5_000
-
-SPARQL_HEADER="
-prefix rdfs:
-prefix rdf:
-prefix dc:
-prefix schema:
-PREFIX pubseq:
-"
-
-# Build a SPARQL query, submit and return results. Apply transform
-# lambda when passed in
-def sparql query, transform = nil
-  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
-
-  response = Net::HTTP.get_response(URI.parse(api_url))
-  data = JSON.parse(response.body,symbolize_names: true)
-  data => { head: { vars: }, results: { bindings: results} }
-  vars = vars.map { |v| v.to_sym }
-  results.map { |rec|
-    # return results after transforming to a Hash and applying the
-    # optional transform lambda. Note the transform can not only
-    # reduce results, or create an array, but also may transform into
-    # an OpenStruct.
-    res = {}
-    vars.each { |name| res[name] = rec[name][:value] }
-    if transform
-      transform.call(res)
-    else
-      res
-    end
-  }
-end
-
-start = 0
-num = MAX
-begin
-  query = "
-SELECT DISTINCT ?id
-FROM
-WHERE {
-
-  ?arvid ?id .
-
-} LIMIT #{num} OFFSET #{start}
-"
-  list = sparql(query, lambda { |rec| rec[:id] })
-  list.each do | l |
-    print(l,"\n")
-  end
-  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
-  start += num
-end while list.size == MAX
-- cgit v1.2.3
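The Ruby script above pages through the SPARQL endpoint in MAX-sized batches, using LIMIT/OFFSET and stopping once a batch comes back short. The same pattern in Python, as a rough sketch only: the endpoint is the one hard-coded in the script, but GRAPH_URI and PREDICATE_URI are placeholders, because the angle-bracketed URIs in the query text above did not survive rendering.

```python
import requests

ENDPOINT = "http://sparql.genenetwork.org/sparql/"  # endpoint used by the script above
MAX = 5_000

def sparql_ids(offset, limit=MAX):
    # GRAPH_URI and PREDICATE_URI are placeholders, not the real URIs
    query = f"""
SELECT DISTINCT ?id
FROM <GRAPH_URI>
WHERE {{ ?arvid <PREDICATE_URI> ?id . }}
LIMIT {limit} OFFSET {offset}
"""
    r = requests.get(ENDPOINT, params={
        "format": "application/sparql-results+json",
        "query": query})
    r.raise_for_status()
    return [b["id"]["value"] for b in r.json()["results"]["bindings"]]

offset = 0
while True:
    batch = sparql_ids(offset)
    for id in batch:
        print(id)
    if len(batch) < MAX:  # a short batch means the last page was reached
        break
    offset += MAX
```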
From 9d75ce088e6388bf23ae077fd06b2a3f51be1bda Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 09:34:26 +0000
Subject: API: fix returned record to include original metadata

---
 bh20simplewebuploader/api.py          | 34 ++++++++++++++++++++++++++++++++--
 test/rest-api.org                     | 29 +++++++++++++++++++++++++++++
 workflows/pull-data/genbank/README.md | 12 ++++++++++--
 workflows/tools/pubseq-fetch-ids      |  2 +-
 4 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py
index b1b505f..11c74f2 100644
--- a/bh20simplewebuploader/api.py
+++ b/bh20simplewebuploader/api.py
@@ -7,6 +7,9 @@ import sys
 from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
 from bh20simplewebuploader.main import app, sparqlURL
 
+PUBSEQ="http://covid19.genenetwork.org"
+ARVADOS="https://collections.lugli.arvadosapi.com/c="
+
 # Helper functions
 
 def fetch_sample_metadata(id):
@@ -42,13 +45,40 @@ def version():
 
 @app.route('/api/sample/<id>.json')
 def sample(id):
+    """
+
+API sample should return a record pointing to other resources,
+notably: permalink, original metadata record and the fasta
+data.
+
+curl http://localhost:5067/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "date": "2020-04-27",
+    "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml",
+    "permalink": "http://covid19.genenetwork.org/resource/MT533203.1",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+
+
+"""
     # metadata = file.name(seq)+"/metadata.yaml"
     meta = fetch_sample_metadata(id)
     print(meta)
+    # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml
     return jsonify([{
         'id': x['id']['value'],
-        'fasta': x['seq']['value'],
-        'collection': os.path.dirname(x['seq']['value']),
+        'collection': x['seq']['value'],
+        'permalink': PUBSEQ+'/resource/'+x['id']['value'],
+        'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta',
+        'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml',
         'date': x['date']['value'],
         'info': x['info']['value'],
         'specimen': x['specimen']['value'],
diff --git a/test/rest-api.org b/test/rest-api.org
index 66639c3..2ea2b11 100644
--- a/test/rest-api.org
+++ b/test/rest-api.org
@@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version
 }
 #+end_src
 
+The current API can fetch data
+
+#+begin_src js
+curl http://covid19.genenetwork.org/api/search?s=MT533203.1
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence"
+  }
+]
+
+curl http://covid19.genenetwork.org/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "date": "2020-04-27",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+#+end_src
+
+
 The Python3 version is
 
 #+begin_src python :session :exports both
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 5464d1d..188ff6f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq
 
 ```sh
 # --- get list of IDs already in PubSeq
-../../tools/sparql-fetch-ids > pubseq_ids.txt
+../../tools/pubseq-fetch-ids > pubseq_ids.txt
+
 # --- get list of missing genbank IDs
 python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
@@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s
 ```
 
+## Validate GenBank data
+
+To pull the data from PubSeq use the list of pubseq ids generated
+above.
+
+
+
 # TODO
 
-- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
+- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
index 19b2d82..f5920ec 100755
--- a/workflows/tools/pubseq-fetch-ids
+++ b/workflows/tools/pubseq-fetch-ids
@@ -2,7 +2,7 @@
 #
 # Use a SPARQL query to fetch all IDs in the PubSeq database
 #
-#   sparql-fetch-ids > pubseq_ids.txt
+#   pubseq-fetch-ids > pubseq_ids.txt
 #
 # Note: requires Ruby 3.x. Older Ruby gives a syntax error
-- cgit v1.2.3
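After this change a single API call yields a record that links to the permalink, the original metadata.yaml and the FASTA sequence. A minimal consumer, sketched against the fields documented in the docstring above:

```python
import requests

def fetch_sample(id):
    # Field names follow the api.py docstring above: permalink, metadata, fasta
    rec = requests.get(
        f"http://covid19.genenetwork.org/api/sample/{id}.json").json()[0]
    metadata = requests.get(rec['metadata']).text  # original metadata.yaml
    fasta = requests.get(rec['fasta']).text        # sequence.fasta
    return rec['permalink'], metadata, fasta

permalink, metadata, fasta = fetch_sample("MT533203.1")
print(permalink)
```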
2.17", + "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", + "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] + + +""" # metadata = file.name(seq)+"/metadata.yaml" meta = fetch_sample_metadata(id) print(meta) + # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml return jsonify([{ 'id': x['id']['value'], - 'fasta': x['seq']['value'], - 'collection': os.path.dirname(x['seq']['value']), + 'collection': x['seq']['value'], + 'permalink': PUBSEQ+'/resource/'+x['id']['value'], + 'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta', + 'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml', 'date': x['date']['value'], 'info': x['info']['value'], 'specimen': x['specimen']['value'], diff --git a/test/rest-api.org b/test/rest-api.org index 66639c3..2ea2b11 100644 --- a/test/rest-api.org +++ b/test/rest-api.org @@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version } #+end_src +The current API can fetch data + +#+begin_src js +curl http://covid19.genenetwork.org/api/search?s=MT533203.1 +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence" + } +] + +curl http://covid19.genenetwork.org/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "date": "2020-04-27", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 2.17", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] +#+end_src + + The Python3 version is #+begin_src python :session :exports both diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index 5464d1d..188ff6f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -../../tools/sparql-fetch-ids > pubseq_ids.txt +../../tools/pubseq-fetch-ids > pubseq_ids.txt + # --- get list of missing genbank IDs python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt @@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s ``` +## Validate GenBank data + +To pull the data from PubSeq use the list of pubseq ids generated +above. + + + # TODO -- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank +- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids index 19b2d82..f5920ec 100755 --- a/workflows/tools/pubseq-fetch-ids +++ b/workflows/tools/pubseq-fetch-ids @@ -2,7 +2,7 @@ # # Use a SPARQL query to fetch all IDs in the PubSeq database # -# sparql-fetch-ids > pubseq_ids.txt +# pubseq-fetch-ids > pubseq_ids.txt # # Note: requires Ruby 3.x. 
From 2ceeccd5e5158362548b868390e9d411f73cd9ff Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:29:41 +0000
Subject: fetch: do a straight dump of the original record

---
 workflows/tools/pubseq-fetch-data.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index c22d754..3f5e6cf 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -28,14 +28,13 @@ if (len(ids)==0):
     with open(args.ids) as f:
         ids = [ l.strip() for l in f.readlines() ]
 
-for id in ids[0:2]:
+for id in ids:
     print(id)
     r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
     if r:
         m_url = r.json()[0]['metadata']
         mr = requests.get(m_url)
-        meta = mr.json()
         with open(dir+"/"+id+".json","w") as outf:
-            json.dump(meta, outf, indent=4)
+            outf.write(mr.text)
     else:
         raise Exception(f"Can not find record for {id}")
-- cgit v1.2.3

From ced9613aa1c18c6a68056d1898b69865beac9ac2 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 12:35:05 +0000
Subject: Add option for fetching fasta

---
 workflows/tools/pubseq-fetch-data.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 3f5e6cf..23c4dea 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -12,6 +12,7 @@ Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
 can be passed in on the command line or in a file.
 
 """)
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
 parser.add_argument('--out', type=str, help='Directory to write to',
                     required=True)
 parser.add_argument('--ids', type=str, help='File with ids', required=False)
@@ -36,5 +37,10 @@ for id in ids:
         mr = requests.get(m_url)
         with open(dir+"/"+id+".json","w") as outf:
             outf.write(mr.text)
+        if args.fasta:
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(dir+"/"+id+".fa","w") as outf:
+                outf.write(fr.text)
     else:
         raise Exception(f"Can not find record for {id}")
-- cgit v1.2.3
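With these two patches the fetcher writes the original record verbatim and can optionally pull the FASTA alongside it. A usage sketch, using only the flags defined in the argparse block above (the output directory must exist beforehand):

```python
import subprocess

# Fetch the metadata JSON plus the FASTA for one accession into ./tmp
subprocess.run(["python3", "workflows/tools/pubseq-fetch-data.py",
                "--fasta", "--out", "tmp", "MT533203.1"], check=True)
```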
""") +parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records') parser.add_argument('--out', type=str, help='Directory to write to', required=True) parser.add_argument('--ids', type=str, help='File with ids', required=False) @@ -36,5 +37,10 @@ for id in ids: mr = requests.get(m_url) with open(dir+"/"+id+".json","w") as outf: outf.write(mr.text) + if args.fasta: + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(dir+"/"+id+".fa","w") as outf: + outf.write(fr.text) else: raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From 17cd8caa85991784f205109f2b64b255726a0e80 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 07:13:15 -0600 Subject: Fetching fixes --- workflows/tools/pubseq-fetch-data.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'workflows') diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py index 23c4dea..2119fdf 100755 --- a/workflows/tools/pubseq-fetch-data.py +++ b/workflows/tools/pubseq-fetch-data.py @@ -31,16 +31,20 @@ if (len(ids)==0): for id in ids: print(id) - r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") - if r: - m_url = r.json()[0]['metadata'] - mr = requests.get(m_url) - with open(dir+"/"+id+".json","w") as outf: - outf.write(mr.text) - if args.fasta: - fa_url = r.json()[0]['fasta'] - fr = requests.get(fa_url) - with open(dir+"/"+id+".fa","w") as outf: - outf.write(fr.text) - else: - raise Exception(f"Can not find record for {id}") + jsonfn = dir+"/"+id+".json" + if not os.path.exists(jsonfn): + r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") + if r: + m_url = r.json()[0]['metadata'] + mr = requests.get(m_url) + with open(dir+"/"+id+".json","w") as outf: + outf.write(mr.text) + if args.fasta: + fastafn = dir+"/"+id+".fa" + if os.path.exists(fastafn): continue + fa_url = r.json()[0]['fasta'] + fr = requests.get(fa_url) + with open(fastafn,"w") as outf: + outf.write(fr.text) + else: + raise Exception(f"Can not find record for {id}") -- cgit v1.2.3 From f7666a7766c8138aa690340fc68cb67f709327f3 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Tue, 5 Jan 2021 17:35:46 +0100 Subject: cleaning genbank-fetch-ids.py --- workflows/pull-data/genbank/genbank-fetch-ids.py | 40 +++++++++++------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'workflows') diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py index 1962daa..cb48cd8 100755 --- a/workflows/pull-data/genbank/genbank-fetch-ids.py +++ b/workflows/pull-data/genbank/genbank-fetch-ids.py @@ -6,28 +6,20 @@ # # See also directory .guix-run and README.md -BATCH_SIZE=5000 - import argparse -import json -import os -import requests import sys -import xml.etree.ElementTree as ET -from datetime import date, datetime -from dateutil.parser import parse +from datetime import date + +from Bio import Entrez parser = argparse.ArgumentParser() parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False) args = parser.parse_args() -from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME - -# min_acceptable_collection_date = datetime(2019, 12, 1) +BATCH_SIZE = 5000 -today_date = date.today().strftime("%Y.%m.%d") +Entrez.email = 'another_email@gmail.com' # FIXME skip = set() if args.skip: @@ -36,10 +28,11 @@ if 
From 911ba372cfc4b35c5b52d18a573a636ea78d16d7 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Tue, 5 Jan 2021 17:56:19 +0100
Subject: cleaning update-from-genbank.py; removed unused import from genbank-fetch-ids.py

---
 workflows/pull-data/genbank/genbank-fetch-ids.py   |  1 -
 workflows/pull-data/genbank/update-from-genbank.py | 25 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index cb48cd8..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -8,7 +8,6 @@
 
 import argparse
 import sys
-from datetime import date
 
 from Bio import Entrez
 
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
 from utils import chunks
 
 from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
 
-BATCH=100
+Entrez.email = 'another_email@gmail.com'  # FIXME
+
+BATCH = 100
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
 parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
 parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
 args = parser.parse_args()
 
 ids = set()
 with open(args.ids) as f:
-    content = f.readlines()
-    for line in content:
-        ids.add(line.strip())
+    ids.update([line.strip() for line in f])
 
 dir = args.out
 if not os.path.exists(dir):
     raise Exception(f"Directory {dir} does not exist")
 
 request_num = BATCH
 if args.max:
-    request_num = min(BATCH,args.max)
+    request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+    xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
 
-for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
-    print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
-        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-        if args.max and i*request_num >= args.max:
+        f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+    if args.max and num_chunk * request_num >= args.max:
         break
-- cgit v1.2.3
From c31835f787f3ae36e26bad0a1803f8557f8084e7 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 6 Jan 2021 02:33:35 -0600
Subject: Pubseq fetch: sometimes a request times out. So repeat with intervals.

---
 workflows/tools/pubseq-fetch-data.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 2119fdf..ef4edde 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -5,6 +5,7 @@ import json
 import os
 import requests
 import sys
+import time
 
 parser = argparse.ArgumentParser(description="""
 
@@ -33,18 +34,22 @@ for id in ids:
     print(id)
     jsonfn = dir+"/"+id+".json"
     if not os.path.exists(jsonfn):
+        count = 0
         r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        if r:
-            m_url = r.json()[0]['metadata']
-            mr = requests.get(m_url)
-            with open(dir+"/"+id+".json","w") as outf:
-                outf.write(mr.text)
-            if args.fasta:
-                fastafn = dir+"/"+id+".fa"
-                if os.path.exists(fastafn): continue
-                fa_url = r.json()[0]['fasta']
-                fr = requests.get(fa_url)
-                with open(fastafn,"w") as outf:
-                    outf.write(fr.text)
-        else:
-            raise Exception(f"Can not find record for {id}")
+        while not r:
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
-- cgit v1.2.3
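The retry logic above is inlined in the main loop; factored out it is a small helper. A sketch with the same parameters as the patch (up to 10 attempts, 15 seconds apart); `get_with_retry` is a name introduced here, not something in the repository:

```python
import time
import requests

def get_with_retry(url, attempts=10, pause=15):
    # Like the patch above: a falsy response (HTTP error) triggers a retry
    for n in range(attempts):
        r = requests.get(url)
        if r:
            return r
        if n < attempts - 1:
            time.sleep(pause)
    raise Exception(f"Can not fetch {url}")
```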
From 27a2b926036211469eccbf8c3d9580182482bdc2 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Wed, 6 Jan 2021 14:13:13 +0100
Subject: cleaned utils.py

---
 workflows/pull-data/genbank/utils.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
index 3efc67a..96920a5 100644
--- a/workflows/pull-data/genbank/utils.py
+++ b/workflows/pull-data/genbank/utils.py
@@ -1,5 +1,6 @@
 import os
 
+
 def is_integer(string_to_check):
     try:
         int(string_to_check)
@@ -7,19 +8,26 @@ def is_integer(string_to_check):
     except ValueError:
         return False
 
+
 def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
+
 def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
-    # Check duplicated entry looking at all dictionaries
+    """
+    Check duplicated entry by looking in all dictionaries
+    """
+
     field_to_term_to_uri_dict = {}
 
-    path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
-                              os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+    path_dict_xxx_csv_list = [
+        os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+        os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
+    ]
 
     for path_dict_xxx_csv in path_dict_xxx_csv_list:
-        print('Read {}'.format(path_dict_xxx_csv))
+        print(f'Read {path_dict_xxx_csv}')
 
         with open(path_dict_xxx_csv) as f:
             for line in f:
 
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict:
-                    print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                    print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[term] = uri
 
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict[field]:
-                    print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                    print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[field][term] = uri
 
-    return field_to_term_to_uri_dict
\ No newline at end of file
+    return field_to_term_to_uri_dict
-- cgit v1.2.3
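The two small helpers at the top of utils.py are easy to sanity-check. A quick usage sketch, run from the workflows/pull-data/genbank directory so the import resolves:

```python
from utils import chunks, is_integer

print(list(chunks([1, 2, 3, 4, 5], 2)))     # [[1, 2], [3, 4], [5]]
print(is_integer("42"), is_integer("4.2"))  # True False
```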
From 329a1a7e122eda41016185d1b1e8d50d97f8857b Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Thu, 7 Jan 2021 03:17:47 -0600
Subject: Allow for xml and xml.gz files

---
 workflows/pull-data/genbank/genbank.py      |  3 +-
 .../genbank/transform-genbank-xml2yamlfa.py | 78 ++++++++++++----------
 2 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
                 # print(n,file=sys.stderr)
                 if n != 'Unpublished':
                     institute,address = n.split(',',1)
-                    submitter.submitter_name = institute.split(') ')[1]
+                    if ")" in institute:
+                        submitter.submitter_name = institute.split(')')[1]
                     submitter.submitter_address = address.strip()
             except AttributeError:
                 pass
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
 
 for xmlfn in args.files:
     print(f"--- Reading {xmlfn}")
-    with gzip.open(xmlfn, 'r') as f:
-        xml = f.read().decode()
-        tree = ET.fromstring(xml)
-        for gb in tree.findall('./GBSeq'):
-            valid = None
-            error = None
-            meta = {}
-            id = gb.find("GBSeq_locus").text
-            basename = dir+"/"+id
-            print(f"    parsing {id}")
-            try:
-                valid,meta = genbank.get_metadata(id,gb)
-                if valid:
-                    # --- write JSON
-                    jsonfn = basename + ".json"
-                    with open(jsonfn, 'w') as outfile:
-                        print(f"    writing {jsonfn}")
-                        json.dump(meta, outfile, indent=4)
-                    # --- write FASTA
-                    fa = basename+".fa"
-                    seq = genbank.get_sequence(id,gb)
-                    print(f"    writing {fa}")
-                    with open(fa,"w") as f2:
-                        f2.write(f"> {id}\n")
-                        f2.write(seq)
-                    # print(seq)
-            except genbank.GBError as e:
-                error = f"{e} for {id}"
-                print(error,file=sys.stderr)
-                valid = False
-            state = {}
-            state['valid'] = valid
-            if error:
-                state['error'] = error
-            if meta['warnings']:
-                state['warnings'] = meta['warnings']
-            states[id] = state
+    try:
+        with gzip.open(xmlfn, 'r') as f:
+            xml = f.read().decode()
+    except Exception:
+        with open(xmlfn, 'r') as f:
+            xml = f.read()
+    tree = ET.fromstring(xml)
+    for gb in tree.findall('./GBSeq'):
+        valid = None
+        error = None
+        meta = {}
+        id = gb.find("GBSeq_locus").text
+        basename = dir+"/"+id
+        print(f"    parsing {id}")
+        try:
+            valid,meta = genbank.get_metadata(id,gb)
+            if valid:
+                # --- write JSON
+                jsonfn = basename + ".json"
+                with open(jsonfn, 'w') as outfile:
+                    print(f"    writing {jsonfn}")
+                    json.dump(meta, outfile, indent=4)
+                # --- write FASTA
+                fa = basename+".fa"
+                seq = genbank.get_sequence(id,gb)
+                print(f"    writing {fa}")
+                with open(fa,"w") as f2:
+                    f2.write(f"> {id}\n")
+                    f2.write(seq)
+                # print(seq)
+        except genbank.GBError as e:
+            error = f"{e} for {id}"
+            print(error,file=sys.stderr)
+            valid = False
+        state = {}
+        state['valid'] = valid
+        if error:
+            state['error'] = error
+        if meta['warnings']:
+            state['warnings'] = meta['warnings']
+        states[id] = state
 
 statefn = dir + '/state.json'
 with open(statefn, 'w') as outfile:
-- cgit v1.2.3
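The try/except introduced above opens a file as gzip first and falls back to plain text when that fails, which is what lets the tool accept both .xml and .xml.gz inputs. Factored into a helper, the same trick looks like this; `read_xml` is a name introduced here for illustration:

```python
import gzip

def read_xml(xmlfn):
    """Return file contents whether the file is plain XML or gzipped XML."""
    try:
        with gzip.open(xmlfn, 'r') as f:
            return f.read().decode()
    except OSError:  # gzip.BadGzipFile is an OSError: not a gzipped file
        with open(xmlfn, 'r') as f:
            return f.read()
```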