author    Andrea Guarracino  2021-01-07 23:50:01 +0100
committer Andrea Guarracino  2021-01-07 23:50:01 +0100
commit    4d841d279b2bf73da2ba815d53863c7f2861c956 (patch)
tree      83b9ad136dabacbf7ed54e19b2db6df348bef904
parent    141e619929cee17018417d71111063015e73c366 (diff)
parent    c080c3cffedcc0cc99496b5e70fcfdf998978f16 (diff)
Merge branch 'master' into yamlfa2ttl
-rw-r--r--  bh20simplewebuploader/api.py                                  116
-rw-r--r--  bh20simplewebuploader/main.py                                  26
-rw-r--r--  bh20simplewebuploader/static/main.css                           1
-rw-r--r--  bh20simplewebuploader/templates/home.html                      71
-rw-r--r--  example/esr_example.yaml                                        4
-rw-r--r--  example/uthsc_example.yaml                                      2
-rw-r--r--  test/rest-api.org                                              29
-rw-r--r--  workflows/pull-data/genbank/README.md                          12
-rwxr-xr-x  workflows/pull-data/genbank/genbank-fetch-ids.py               39
-rw-r--r--  workflows/pull-data/genbank/genbank.py                          3
-rwxr-xr-x  workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py    78
-rwxr-xr-x  workflows/pull-data/genbank/update-from-genbank.py             25
-rw-r--r--  workflows/pull-data/genbank/utils.py                           22
-rwxr-xr-x  workflows/tools/pubseq-fetch-data.py                           55
-rwxr-xr-x  workflows/tools/pubseq-fetch-ids (renamed from workflows/tools/sparql-fetch-ids)  2
15 files changed, 348 insertions(+), 137 deletions(-)
diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py
index b1b505f..761ad03 100644
--- a/bh20simplewebuploader/api.py
+++ b/bh20simplewebuploader/api.py
@@ -3,20 +3,26 @@
import os
import requests
import sys
+import types
from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
from bh20simplewebuploader.main import app, sparqlURL
+PUBSEQ="http://covid19.genenetwork.org"
+ARVADOS="https://collections.lugli.arvadosapi.com/c="
+
# Helper functions
-def fetch_sample_metadata(id):
- query = """
+def fetch_sample(id, query=None):
+ default_query = """
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX edam: <http://edamontology.org/>
PREFIX efo: <http://www.ebi.ac.uk/efo/>
PREFIX evs: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
+
select distinct ?id ?seq ?date ?info ?specimen ?sequencer ?mapper
{
?sample sio:SIO_000115 "%s" ;
@@ -24,15 +30,49 @@ def fetch_sample_metadata(id):
evs:C25164 ?date .
?seq pubseq:technology ?tech ;
pubseq:sample ?sample .
- ?tech efo:EFO_0002699 ?mapper ;
- obo:OBI_0600047 ?sequencer .
+ optional { ?tech efo:EFO_0002699 ?mapper } .
+ optional { ?tech obo:OBI_0600047 ?sequencer . }
optional { ?sample edam:data_2091 ?info } .
optional { ?sample obo:OBI_0001479 ?specimen } .
} limit 5
+
""" % id
+ if not query: query = default_query
+ print(query)
payload = {'query': query, 'format': 'json'}
r = requests.get(sparqlURL, params=payload)
- return r.json()['results']['bindings']
+ res = r.json()
+ print(res)
+ return res['results']['bindings'],res['head']['vars']
+
+def fetch_one_sample(id, query=None):
+ """Get the top sample and return a SimpleNamespace"""
+
+ result,varlist = fetch_sample(id,query)
+ h = {}
+ row = result[0]
+ for key in varlist:
+ if key in row:
+ h[key] = row[key]['value']
+ print(h)
+ h['arv_id'] = os.path.basename(h['seq'])
+ return types.SimpleNamespace(**h)
+
+def fetch_one_record(id):
+ m = fetch_one_sample(id)
+ arv_id = m.arv_id
+ rec = { "id": id,
+ 'arv_id': arv_id,
+ "permalink": PUBSEQ+'/resource/'+id,
+ "collection": m.seq,
+ 'collection_date': m.date,
+ 'fasta': ARVADOS+arv_id+'/sequence.fasta',
+ 'metadata': ARVADOS+arv_id+'/metadata.yaml',
+ }
+ h = m.__dict__ # for optional items
+ if 'mapper' in h: rec['mapper'] = m.mapper
+ if 'sequencer' in h: rec['sequencer']= m.sequencer
+ return rec
# Main API routes
@@ -42,54 +82,40 @@ def version():
@app.route('/api/sample/<id>.json')
def sample(id):
- # metadata = file.name(seq)+"/metadata.yaml"
- meta = fetch_sample_metadata(id)
- print(meta)
- return jsonify([{
- 'id': x['id']['value'],
- 'fasta': x['seq']['value'],
- 'collection': os.path.dirname(x['seq']['value']),
- 'date': x['date']['value'],
- 'info': x['info']['value'],
- 'specimen': x['specimen']['value'],
- 'sequencer': x['sequencer']['value'],
- 'mapper': x['mapper']['value'],
- } for x in meta])
+ """
+
+API sample should return a record pointing to other resources,
+notably: permalink, original metadata record and the fasta
+data.
+
+curl http://localhost:5067/api/sample/MT533203.1.json
+{
+ "id": "MT533203.1",
+ "permalink": "http://covid19.genenetwork.org/resource/MT533203.1",
+ "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+ "collection_date": "2020-04-27",
+ "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta",
+ "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml",
+ "mapper": "minimap v. 2.17",
+ "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632"
+}
+
+"""
+
+ return jsonify([fetch_one_record(id)])
@app.route('/api/ebi/sample-<id>.xml', methods=['GET'])
def ebi_sample(id):
- meta = fetch_sample_metadata(id)[0]
+ meta,varlist = fetch_sample(id)[0]
page = render_template('ebi-sample.xml',sampleid=id,sequencer=meta['sequencer']['value'],date=meta['date']['value'],specimen=meta['specimen']['value'])
return page
@app.route('/api/search', methods=['GET'])
def search():
"""
- Execute a 'global search'
+ Execute a 'global search'. Currently just duplicates fetch one
+ sample. Should be more flexible FIXME.
"""
s = request.args.get('s')
- if s == "":
- s = "MT326090.1"
- query = """
- PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
- PREFIX sio: <http://semanticscience.org/resource/>
- PREFIX edam: <http://edamontology.org/>
- select distinct ?id ?seq ?info
- {
- ?sample sio:SIO_000115 "%s" .
- ?sample sio:SIO_000115 ?id .
- ?seq pubseq:sample ?sample .
- ?sample edam:data_2091 ?info .
- } limit 100
- """ % s
- payload = {'query': query, 'format': 'json'}
- r = requests.get(sparqlURL, params=payload)
- result = r.json()['results']['bindings']
- # metadata = file.name(seq)+"/metadata.yaml"
- print(result)
- return jsonify([{
- 'id': x['id']['value'],
- 'fasta': x['seq']['value'],
- 'collection': os.path.dirname(x['seq']['value']),
- 'info': x['info']['value'],
- } for x in result])
+ if s == "": s = "MT326090.1"
+ return jsonify([fetch_one_record(s)])
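The refactored helpers above flatten a SPARQL result row into plain attributes. A minimal sketch of that pattern, assuming only the row shape returned by the SPARQL endpoint (values taken from the docstring example; 'mapper' is optional and absent in this row):

import os
import types

# One JSON bindings row as the SPARQL endpoint returns it.
row = {"id":   {"value": "MT533203.1"},
       "seq":  {"value": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks"},
       "date": {"value": "2020-04-27"}}
varlist = ["id", "seq", "date", "mapper"]

h = {key: row[key]["value"] for key in varlist if key in row}
h["arv_id"] = os.path.basename(h["seq"])   # same trick fetch_one_sample uses
m = types.SimpleNamespace(**h)
print(m.id, m.date, m.arv_id)              # MT533203.1 2020-04-27 lugli-4zz18-uovend31hdwa5ks
print("mapper" in m.__dict__)              # False -> optional fields simply stay missing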
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index b620946..504f03c 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -34,6 +34,7 @@ if not os.path.isfile('bh20sequploader/main.py'):
print("WARNING: run FLASK from the root of the source repository!", file=sys.stderr)
app = Flask(__name__, static_url_path='/static', static_folder='static')
+app.config['JSON_SORT_KEYS'] = False
# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes.
# We will enforce the limit ourselves and set a higher safety limit here.
@@ -252,7 +253,7 @@ FORM_ITEMS = load_schema_generate_form()
def get_feed_items(name, start=0, stop=9):
redis_client = redis.Redis(host=os.environ.get('HOST', 'localhost'),
port=os.environ.get('PORT', 6379),
- db=os.environ.get('REDIS_DB', 0))
+ db=os.environ.get('REDIS_DB', 0))
feed_items = []
try:
for el in redis_client.zrevrange(name, start, stop):
@@ -272,12 +273,23 @@ def send_home():
"""
Send the front page.
"""
+ (tweets,
+ commits,
+ pubmed_articles,
+ arxiv_articles) = [get_feed_items(x) for x in ["bh20-tweet-score:",
+ "bh20-commit-score:",
+ "bh20-pubmed-score:",
+ "bh20-arxiv-score:"]]
return render_template(
'home.html', menu='HOME',
- tweets=get_feed_items("bh20-tweet-score:"),
- commits=get_feed_items("bh20-commit-score:"),
- pubmed_articles=get_feed_items("bh20-pubmed-score:"),
- arxiv_articles=get_feed_items("bh20-arxiv-score:"),
+ all_items=list(itertools.chain(tweets,
+ commits,
+ pubmed_articles,
+ arxiv_articles)),
+ tweets=tweets,
+ commits=commits,
+ pubmed_articles=pubmed_articles,
+ arxiv_articles=arxiv_articles,
load_map=True)
@@ -750,8 +762,8 @@ union
# http://covid19.genenetwork.org/resource/lugli-4zz18-gx0ifousk9yu0ql
m = re.match(r"http://collections.lugli.arvadosapi.com/c=([^/]*)/sequence.fasta|http://covid19.genenetwork.org/resource/(.*)", sequenceuri)
collection = m.group(1) or m.group(2)
- fastauri = f"http://collections.lugli.arvadosapi.com/c={collection}/sequence.fasta"
- metauri = f"http://collections.lugli.arvadosapi.com/c={collection}/metadata.yaml"
+ fastauri = f"https://collections.lugli.arvadosapi.com/c={collection}/sequence.fasta"
+ metauri = f"https://collections.lugli.arvadosapi.com/c={collection}/metadata.yaml"
locationuri=sample['geo']['value']
location=sample['geoname']['value']
date=sample['date']['value']
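The send_home() change above hands the template a single all_items list built from the four Redis-backed feeds; home.html then ranks it with |sort(reverse=true, attribute="score"). A small sketch of that merge-and-rank step, with toy items standing in for what get_feed_items() returns (each item is assumed to carry a numeric 'score' key):

import itertools

# Toy feed items; the real ones come from get_feed_items() reading Redis sorted sets.
tweets          = [{"tweet": "new release", "score": 5}]
commits         = [{"content": "fix uploader", "score": 9}]
pubmed_articles = [{"summary": "a paper", "score": 7}]
arxiv_articles  = [{"title": "a preprint", "score": 3}]

all_items = list(itertools.chain(tweets, commits, pubmed_articles, arxiv_articles))
# Equivalent of Jinja's |sort(reverse=true, attribute="score") in home.html
ranked = sorted(all_items, key=lambda item: item["score"], reverse=True)
print([item["score"] for item in ranked])   # [9, 7, 5, 3]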
diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
index fbc721e..e2f0c83 100644
--- a/bh20simplewebuploader/static/main.css
+++ b/bh20simplewebuploader/static/main.css
@@ -567,6 +567,7 @@ input[name="feed-tabs"] ~ .tab {
display: none;
}
+#tab-all-items:checked ~ .tab.content-all-items,
#tab-pubmed-articles:checked ~ .tab.content-pubmed-articles,
#tab-arxiv-articles:checked ~ .tab.content-arxiv-articles,
#tab-tweets:checked ~ .tab.content-tweets,
diff --git a/bh20simplewebuploader/templates/home.html b/bh20simplewebuploader/templates/home.html
index a880f81..23f48bf 100644
--- a/bh20simplewebuploader/templates/home.html
+++ b/bh20simplewebuploader/templates/home.html
@@ -29,7 +29,9 @@
</div>
<div id="feed">
- <input name="feed-tabs" type="radio" id="tab-arxiv-articles" checked/>
+ <input name="feed-tabs" type="radio" id="tab-all-items" checked/>
+ <label for="tab-all-items">All Items</label>
+ <input name="feed-tabs" type="radio" id="tab-arxiv-articles"/>
<label for="tab-arxiv-articles">Arxiv</label>
<input name="feed-tabs" type="radio" id="tab-pubmed-articles"/>
<label for="tab-pubmed-articles">Pubmed</label>
@@ -37,6 +39,73 @@
<label for="tab-tweets">Tweets</label>
<input name="feed-tabs" type="radio" id="tab-commits"/>
<label for="tab-commits">Commits</label>
+ <ul class="tab content-all-items">
+ <!-- Begin News -->
+ {% if all_items %}
+ {% for item in all_items|sort(reverse=true, attribute="score")%}
+ <li>
+ {% if item['authors'] %}
+ <!-- Arxiv article -->
+ <p>
+ <b>[arxiv]</b>
+ <a href="{{ item['url'] }}" target="_blank">
+ {{item['title']}}
+ </a>
+ <br/>
+ <b>Authors:</b> {{ item['authors'] }}
+ <br/>
+ <b>Abstract:</b> {{ item['abstract']}}...
+ <br/>
+ <b>Submitted:</b> {{ item['submission']}}
+ </p>
+
+ {% elif item['full-authors'] %}
+ <!-- Pubmed Article -->
+ <p><b>[Pubmed]:</b>
+ <a href="https://pubmed.ncbi.nlm.nih.gov/{{ item['docsum-pmid'] }}" target="_blank"><b>Summary:</b>
+ {{ item['summary'] }}
+ </a> <br/>
+ <b>Full Authors:</b> {{ item['full-authors'] }} <br/>
+ <b>Short Authors:</b> {{ item['short-authors'] }} <br/>
+ <b>Citation:</b> {{ item['citation'] }} <br/>
+ <b>Short Journal Citation:</b> {{ item['short-journal-citation'] }} <br/>
+ </p>
+
+ {% elif item['tweet'] %}
+ <!-- Tweets -->
+ <p>
+ <b>[Tweet]:</b>
+ {{ item['tweet']|urlize(40, target="_blank")}}
+ <small>
+ <a href="{{ item['url'] }}" target="_blank">source</a></small>
+ <br/>
+ by {{ item['author'] }}
+ <br/>
+ <small>{{ item['timeposted'] }}</small>
+ </p>
+
+ {% elif item['repository-url'] %}
+ <!-- Commits -->
+ <p>
+ <b>[Commit]:</b>
+ <a href="{{ item.url }}" target="_blank">
+ {{ item.hash.split(":")[-1][:7] }}: {{ item.content }}
+ </a>
+ <br/>
+ <small>
+ <a href="{{ item['repository-url'] }}" target="_blank"> {{ item.author }}/{{ item.repository }}</a>
+ on {{ item.timeposted }}
+ </small>
+ </p>
+ {% endif %}
+ </li>
+ {%endfor%}
+
+ {% else %}
+ There are no items to display :(
+ {% endif %}
+ <!-- End News -->
+ </ul>
<ul class="tab content-arxiv-articles">
{% if arxiv_articles %}
{% for article in arxiv_articles|sort(reverse=true, attribute="score")%}
diff --git a/example/esr_example.yaml b/example/esr_example.yaml
index c97d0bf..c7bdb30 100644
--- a/example/esr_example.yaml
+++ b/example/esr_example.yaml
@@ -15,7 +15,7 @@ sample:
collection_date: "2020-02-26"
collection_location: https://www.wikidata.org/wiki/Q37100
specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
- source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] ??
+ source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
additional_collection_information: Optional free text field for additional information
virus:
@@ -23,7 +23,7 @@ virus:
virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
technology:
- sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] // Nanopore MinION
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION
alignment_protocol: https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics
assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
additional_technology_information: "Artic V3 workflow"
diff --git a/example/uthsc_example.yaml b/example/uthsc_example.yaml
index 661cf60..589a7a5 100644
--- a/example/uthsc_example.yaml
+++ b/example/uthsc_example.yaml
@@ -23,7 +23,7 @@ virus:
virus_strain: SARS-CoV-2/human/USA/AL_UT14/2020
technology:
- sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] // Nanopore MinION
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION
alignment_protocol: guppy
assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
additional_technology_information: Optional free text field for additional information
diff --git a/test/rest-api.org b/test/rest-api.org
index 66639c3..2ea2b11 100644
--- a/test/rest-api.org
+++ b/test/rest-api.org
@@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version
}
#+end_src
+The current API can fetch data
+
+#+begin_src js
+curl http://covid19.genenetwork.org/api/search?s=MT533203.1
+[
+ {
+ "collection": "http://covid19.genenetwork.org/resource",
+ "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+ "id": "MT533203.1",
+ "info": "http://identifiers.org/insdc/MT533203.1#sequence"
+ }
+]
+
+curl http://covid19.genenetwork.org/api/sample/MT533203.1.json
+[
+ {
+ "collection": "http://covid19.genenetwork.org/resource",
+ "date": "2020-04-27",
+ "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+ "id": "MT533203.1",
+ "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+ "mapper": "minimap v. 2.17",
+ "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+ "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+ }
+]
+#+end_src
+
+
The Python3 version is
#+begin_src python :session :exports both
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 5464d1d..188ff6f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq
```sh
# --- get list of IDs already in PubSeq
-../../tools/sparql-fetch-ids > pubseq_ids.txt
+../../tools/pubseq-fetch-ids > pubseq_ids.txt
+
# --- get list of missing genbank IDs
python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
@@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s
```
+## Validate GenBank data
+
+To pull the data from PubSeq use the list of pubseq ids generated
+above.
+
+
+
# TODO
-- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
+- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
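The new "Validate GenBank data" section stops short of showing the pull itself. A hedged sketch of that step, following the logic of workflows/tools/pubseq-fetch-data.py introduced later in this commit (the output directory and id file names are only examples):

import os
import requests

outdir = "pubseq-data"                       # example output directory
os.makedirs(outdir, exist_ok=True)

with open("pubseq_ids.txt") as f:            # list produced by pubseq-fetch-ids above
    ids = [line.strip() for line in f if line.strip()]

for id in ids:
    r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
    meta_url = r.json()[0]["metadata"]       # points at metadata.yaml on Arvados
    with open(os.path.join(outdir, id + ".json"), "w") as out:
        out.write(requests.get(meta_url).text)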
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index 1962daa..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -6,28 +6,19 @@
#
# See also directory .guix-run and README.md
-BATCH_SIZE=5000
-
import argparse
-import json
-import os
-import requests
import sys
-import xml.etree.ElementTree as ET
-from datetime import date, datetime
-from dateutil.parser import parse
+
+from Bio import Entrez
parser = argparse.ArgumentParser()
parser.add_argument('--max', type=int, help='Max queries', required=False)
parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False)
args = parser.parse_args()
-from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-
-# min_acceptable_collection_date = datetime(2019, 12, 1)
+BATCH_SIZE = 5000
-today_date = date.today().strftime("%Y.%m.%d")
+Entrez.email = 'another_email@gmail.com' # FIXME
skip = set()
if args.skip:
@@ -36,10 +27,11 @@ if args.skip:
for line in content:
skip.add(line.strip())
-print(f"Skip size is {len(skip)}",file=sys.stderr)
+print(f"Skip size is {len(skip)}", file=sys.stderr)
# Try to search several strings
TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
+
# Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with
PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']
@@ -47,22 +39,27 @@ ids = set()
for term in TERMS:
num_read = BATCH_SIZE
retstart = 0
+
while num_read == BATCH_SIZE:
record = Entrez.read(
- Entrez.esearch(db='nuccore', term=term, idtype='acc',
- retstart=retstart, retmax=BATCH_SIZE)
+ Entrez.esearch(db='nuccore', term=term, idtype='acc', retstart=retstart, retmax=BATCH_SIZE)
)
+
idlist = record['IdList']
new_ids = set(idlist)
num_read = len(new_ids)
- print(num_read,":",idlist[0],file=sys.stderr)
retstart += num_read
- new_ids.difference_update(skip) # remove skip ids
+
+ print(num_read, ":", idlist[0], file=sys.stderr)
+
+ new_ids.difference_update(skip) # remove skip ids
new_ids = set([id for id in new_ids if id[:2] not in PREFIX])
- ids.update(new_ids) # add to total set
- print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)})",file=sys.stderr)
+ ids.update(new_ids) # add to total set
+
+ print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)}", file=sys.stderr)
+
if args.max and len(ids) > args.max:
- print(f"Stopping past #{args.max} items",file=sys.stderr)
+ print(f"Stopping past #{args.max} items", file=sys.stderr)
break
for id in ids:
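The loop above pages through Entrez.esearch until a batch comes back smaller than BATCH_SIZE. A condensed sketch of that paging pattern for a single term (the e-mail address is a placeholder, as in the script's own FIXME):

from Bio import Entrez

Entrez.email = 'you@example.org'             # placeholder
BATCH_SIZE = 5000
ids, retstart, num_read = set(), 0, BATCH_SIZE

while num_read == BATCH_SIZE:                # a short page means we reached the end
    record = Entrez.read(
        Entrez.esearch(db='nuccore', term='txid2697049[Organism]', idtype='acc',
                       retstart=retstart, retmax=BATCH_SIZE))
    new_ids = set(record['IdList'])
    num_read = len(new_ids)
    retstart += num_read
    ids.update(new_ids)

print(len(ids))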
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
# print(n,file=sys.stderr)
if n != 'Unpublished':
institute,address = n.split(',',1)
- submitter.submitter_name = institute.split(') ')[1]
+ if ")" in institute:
+ submitter.submitter_name = institute.split(')')[1]
submitter.submitter_address = address.strip()
except AttributeError:
pass
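The guard added above protects against submitter strings that carry no ")"-delimited prefix. A small sketch of the parsing with the guard in place; the sample string is illustrative, not taken from a real record:

import types

def parse_submitter(n):
    submitter = types.SimpleNamespace()
    if n != 'Unpublished':
        institute, address = n.split(',', 1)
        if ")" in institute:                          # guard introduced in this commit
            submitter.submitter_name = institute.split(')')[1].strip()
        submitter.submitter_address = address.strip()
    return submitter

s = parse_submitter("(05-MAY-2020) Dept of Virology, 1 Example Road, Sometown")
print(getattr(s, "submitter_name", None), "|", s.submitter_address)
# Dept of Virology | 1 Example Road, Sometown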
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
for xmlfn in args.files:
print(f"--- Reading {xmlfn}")
- with gzip.open(xmlfn, 'r') as f:
- xml = f.read().decode()
- tree = ET.fromstring(xml)
- for gb in tree.findall('./GBSeq'):
- valid = None
- error = None
- meta = {}
- id = gb.find("GBSeq_locus").text
- basename = dir+"/"+id
- print(f" parsing {id}")
- try:
- valid,meta = genbank.get_metadata(id,gb)
- if valid:
- # --- write JSON
- jsonfn = basename + ".json"
- with open(jsonfn, 'w') as outfile:
- print(f" writing {jsonfn}")
- json.dump(meta, outfile, indent=4)
- # --- write FASTA
- fa = basename+".fa"
- seq = genbank.get_sequence(id,gb)
- print(f" writing {fa}")
- with open(fa,"w") as f2:
- f2.write(f"> {id}\n")
- f2.write(seq)
- # print(seq)
- except genbank.GBError as e:
- error = f"{e} for {id}"
- print(error,file=sys.stderr)
- valid = False
- state = {}
- state['valid'] = valid
- if error:
- state['error'] = error
- if meta['warnings']:
- state['warnings'] = meta['warnings']
- states[id] = state
+ try:
+ with gzip.open(xmlfn, 'r') as f:
+ xml = f.read().decode()
+ except Exception:
+ with open(xmlfn, 'r') as f:
+ xml = f.read()
+ tree = ET.fromstring(xml)
+ for gb in tree.findall('./GBSeq'):
+ valid = None
+ error = None
+ meta = {}
+ id = gb.find("GBSeq_locus").text
+ basename = dir+"/"+id
+ print(f" parsing {id}")
+ try:
+ valid,meta = genbank.get_metadata(id,gb)
+ if valid:
+ # --- write JSON
+ jsonfn = basename + ".json"
+ with open(jsonfn, 'w') as outfile:
+ print(f" writing {jsonfn}")
+ json.dump(meta, outfile, indent=4)
+ # --- write FASTA
+ fa = basename+".fa"
+ seq = genbank.get_sequence(id,gb)
+ print(f" writing {fa}")
+ with open(fa,"w") as f2:
+ f2.write(f"> {id}\n")
+ f2.write(seq)
+ # print(seq)
+ except genbank.GBError as e:
+ error = f"{e} for {id}"
+ print(error,file=sys.stderr)
+ valid = False
+ state = {}
+ state['valid'] = valid
+ if error:
+ state['error'] = error
+ if meta['warnings']:
+ state['warnings'] = meta['warnings']
+ states[id] = state
statefn = dir + '/state.json'
with open(statefn, 'w') as outfile:
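The rewritten loop first tries gzip and falls back to plain text, so both metadata_*.xml.gz and uncompressed dumps are accepted. A minimal sketch of that fallback on its own, catching OSError rather than the bare Exception used above:

import gzip

def read_xml(xmlfn):
    """Return the XML text whether or not the file is gzip-compressed."""
    try:
        with gzip.open(xmlfn, 'r') as f:
            return f.read().decode()
    except OSError:                          # not a gzip file (BadGzipFile is an OSError)
        with open(xmlfn, 'r') as f:
            return f.read()

# xml = read_xml("metadata_0.xml.gz")        # or a plain .xml file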
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
from utils import chunks
from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-BATCH=100
+Entrez.email = 'another_email@gmail.com' # FIXME
+
+BATCH = 100
parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
args = parser.parse_args()
ids = set()
with open(args.ids) as f:
- content = f.readlines()
- for line in content:
- ids.add(line.strip())
+ ids.update([line.strip() for line in f])
dir = args.out
if not os.path.exists(dir):
@@ -37,12 +36,14 @@ if not os.path.exists(dir):
request_num = BATCH
if args.max:
- request_num = min(BATCH,args.max)
+ request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+ xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+ print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
-for i, idsx in enumerate(chunks(list(ids), request_num)):
- xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
- print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
with gzip.open(xmlfn, 'w') as f:
- f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
- if args.max and i*request_num >= args.max:
+ f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+ if args.max and num_chunk * request_num >= args.max:
break
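The fetch loop above walks the id list in batches of request_num and stops once --max ids have been requested. A tiny sketch of the chunks() helper it relies on (the same generator defined in workflows/pull-data/genbank/utils.py) and the enumeration that drives the batch counter:

def chunks(lst, n):
    """Yield successive n-sized slices of lst (as in utils.py)."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

ids = [f"ID{i}" for i in range(7)]
for num_chunk, ids_chunk in enumerate(chunks(ids, 3)):
    print(num_chunk, ids_chunk)
# 0 ['ID0', 'ID1', 'ID2']
# 1 ['ID3', 'ID4', 'ID5']
# 2 ['ID6']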
diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
index 3efc67a..96920a5 100644
--- a/workflows/pull-data/genbank/utils.py
+++ b/workflows/pull-data/genbank/utils.py
@@ -1,5 +1,6 @@
import os
+
def is_integer(string_to_check):
try:
int(string_to_check)
@@ -7,19 +8,26 @@ def is_integer(string_to_check):
except ValueError:
return False
+
def chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i + n]
+
def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
- # Check duplicated entry looking at all dictionaries
+ """
+ Check duplicated entry by looking in all dictionaries
+ """
+
field_to_term_to_uri_dict = {}
- path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
- os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+ path_dict_xxx_csv_list = [
+ os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+ os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
+ ]
for path_dict_xxx_csv in path_dict_xxx_csv_list:
- print('Read {}'.format(path_dict_xxx_csv))
+ print(f'Read {path_dict_xxx_csv}')
with open(path_dict_xxx_csv) as f:
for line in f:
@@ -31,7 +39,7 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
term = term.strip('"')
if term in field_to_term_to_uri_dict:
- print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+ print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
continue
field_to_term_to_uri_dict[term] = uri
@@ -54,9 +62,9 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
term = term.strip('"')
if term in field_to_term_to_uri_dict[field]:
- print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+ print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
continue
field_to_term_to_uri_dict[field][term] = uri
-    return field_to_term_to_uri_dict
\ No newline at end of file
+ return field_to_term_to_uri_dict
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
new file mode 100755
index 0000000..ef4edde
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import requests
+import sys
+import time
+
+parser = argparse.ArgumentParser(description="""
+
+Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
+can be passed in on the command line or in a file.
+
+""")
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
+parser.add_argument('--out', type=str, help='Directory to write to',
+required=True)
+parser.add_argument('--ids', type=str, help='File with ids', required=False)
+parser.add_argument('id', nargs='*', help='id(s)')
+args = parser.parse_args()
+
+dir = args.out
+if not os.path.exists(dir):
+ raise Exception(f"Directory {dir} does not exist")
+
+ids = args.id
+if (len(ids)==0):
+ print(f"Reading {args.ids}")
+ with open(args.ids) as f:
+ ids = [ l.strip() for l in f.readlines() ]
+
+for id in ids:
+ print(id)
+ jsonfn = dir+"/"+id+".json"
+ if not os.path.exists(jsonfn):
+ count = 0
+ r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+ while not r:
+ count += 1
+ if count>10: raise Exception(f"Can not find record for {id}")
+ time.sleep(15)
+ r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+ m_url = r.json()[0]['metadata']
+ mr = requests.get(m_url)
+ with open(dir+"/"+id+".json","w") as outf:
+ outf.write(mr.text)
+ if args.fasta:
+ fastafn = dir+"/"+id+".fa"
+ if os.path.exists(fastafn): continue
+ fa_url = r.json()[0]['fasta']
+ fr = requests.get(fa_url)
+ with open(fastafn,"w") as outf:
+ outf.write(fr.text)
+
diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/pubseq-fetch-ids
index 19b2d82..f5920ec 100755
--- a/workflows/tools/sparql-fetch-ids
+++ b/workflows/tools/pubseq-fetch-ids
@@ -2,7 +2,7 @@
#
# Use a SPARQL query to fetch all IDs in the PubSeq database
#
-# sparql-fetch-ids > pubseq_ids.txt
+# pubseq-fetch-ids > pubseq_ids.txt
#
# Note: requires Ruby 3.x. Older Ruby gives a syntax error
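The renamed tool still runs a SPARQL query to list every id in PubSeq. For readers without Ruby 3.x, an equivalent sketch in Python; the endpoint URL and the sio:SIO_000115 predicate follow what api.py uses elsewhere in this repository and should be treated as assumptions here:

import requests

SPARQL_URL = "http://sparql.genenetwork.org/sparql/"      # assumed endpoint
QUERY = """
PREFIX sio: <http://semanticscience.org/resource/>
SELECT DISTINCT ?id WHERE { ?sample sio:SIO_000115 ?id }
"""

r = requests.get(SPARQL_URL, params={'query': QUERY, 'format': 'json'})
for row in r.json()['results']['bindings']:
    print(row['id']['value'])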