-rw-r--r--  bh20simplewebuploader/api.py                                  | 116
-rw-r--r--  bh20simplewebuploader/main.py                                 |  26
-rw-r--r--  bh20simplewebuploader/static/main.css                         |   1
-rw-r--r--  bh20simplewebuploader/templates/home.html                     |  71
-rw-r--r--  example/esr_example.yaml                                      |   4
-rw-r--r--  example/uthsc_example.yaml                                    |   2
-rw-r--r--  test/rest-api.org                                             |  29
-rw-r--r--  workflows/pull-data/genbank/README.md                         |  12
-rwxr-xr-x  workflows/pull-data/genbank/genbank-fetch-ids.py              |  39
-rw-r--r--  workflows/pull-data/genbank/genbank.py                        |   3
-rwxr-xr-x  workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py   |  78
-rwxr-xr-x  workflows/pull-data/genbank/update-from-genbank.py            |  25
-rw-r--r--  workflows/pull-data/genbank/utils.py                          |  22
-rwxr-xr-x  workflows/tools/pubseq-fetch-data.py                          |  55
-rwxr-xr-x  workflows/tools/pubseq-fetch-ids (renamed from workflows/tools/sparql-fetch-ids) | 2
15 files changed, 348 insertions(+), 137 deletions(-)
diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py
index b1b505f..761ad03 100644
--- a/bh20simplewebuploader/api.py
+++ b/bh20simplewebuploader/api.py
@@ -3,20 +3,26 @@
 import os
 import requests
 import sys
+import types
 
 from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
 from bh20simplewebuploader.main import app, sparqlURL
 
+PUBSEQ="http://covid19.genenetwork.org"
+ARVADOS="https://collections.lugli.arvadosapi.com/c="
+
 # Helper functions
 
-def fetch_sample_metadata(id):
-    query = """
+def fetch_sample(id, query=None):
+    default_query = """
+    PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
     PREFIX sio: <http://semanticscience.org/resource/>
     PREFIX edam: <http://edamontology.org/>
     PREFIX efo: <http://www.ebi.ac.uk/efo/>
     PREFIX evs: <http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#>
     PREFIX obo: <http://purl.obolibrary.org/obo/>
+
     select distinct ?id ?seq ?date ?info ?specimen ?sequencer ?mapper
     {
         ?sample sio:SIO_000115 "%s" ;
@@ -24,15 +30,49 @@ def fetch_sample_metadata(id):
                 evs:C25164 ?date .
         ?seq pubseq:technology ?tech ;
              pubseq:sample ?sample .
-        ?tech efo:EFO_0002699 ?mapper ;
-              obo:OBI_0600047 ?sequencer .
+        optional { ?tech efo:EFO_0002699 ?mapper } .
+        optional { ?tech obo:OBI_0600047 ?sequencer . }
         optional { ?sample edam:data_2091 ?info } .
         optional { ?sample obo:OBI_0001479 ?specimen } .
     } limit 5
+    """ % id
+    if not query: query = default_query
+    print(query)
     payload = {'query': query, 'format': 'json'}
     r = requests.get(sparqlURL, params=payload)
-    return r.json()['results']['bindings']
+    res = r.json()
+    print(res)
+    return res['results']['bindings'],res['head']['vars']
+
+def fetch_one_sample(id, query=None):
+    """Get the top sample and return a SimpleNamespace"""
+
+    result,varlist = fetch_sample(id,query)
+    h = {}
+    row = result[0]
+    for key in varlist:
+        if key in row:
+            h[key] = row[key]['value']
+    print(h)
+    h['arv_id'] = os.path.basename(h['seq'])
+    return types.SimpleNamespace(**h)
+
+def fetch_one_record(id):
+    m = fetch_one_sample(id)
+    arv_id = m.arv_id
+    rec = { "id": id,
+            'arv_id': arv_id,
+            "permalink": PUBSEQ+'/resource/'+id,
+            "collection": m.seq,
+            'collection_date': m.date,
+            'fasta': ARVADOS+arv_id+'/sequence.fasta',
+            'metadata': ARVADOS+arv_id+'/metadata.yaml',
+          }
+    h = m.__dict__  # for optional items
+    if 'mapper' in h: rec['mapper'] = m.mapper
+    if 'sequencer' in h: rec['sequencer'] = m.sequencer
+    return rec
 
 # Main API routes
 
@@ -42,54 +82,40 @@ def version():
 
 @app.route('/api/sample/<id>.json')
 def sample(id):
-    # metadata = file.name(seq)+"/metadata.yaml"
-    meta = fetch_sample_metadata(id)
-    print(meta)
-    return jsonify([{
-        'id': x['id']['value'],
-        'fasta': x['seq']['value'],
-        'collection': os.path.dirname(x['seq']['value']),
-        'date': x['date']['value'],
-        'info': x['info']['value'],
-        'specimen': x['specimen']['value'],
-        'sequencer': x['sequencer']['value'],
-        'mapper': x['mapper']['value'],
-    } for x in meta])
+    """
+
+API sample should return a record pointing to other resources,
+notably: permalink, original metadata record and the fasta
+data.
+
+curl http://localhost:5067/api/sample/MT533203.1.json
+{
+  "id": "MT533203.1",
+  "permalink": "http://covid19.genenetwork.org/resource/MT533203.1",
+  "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+  "collection_date": "2020-04-27",
+  "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta",
+  "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml",
+  "mapper": "minimap v. 2.17",
+  "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632"
+}
+
+"""
+
+    return jsonify([fetch_one_record(id)])
 
 @app.route('/api/ebi/sample-<id>.xml', methods=['GET'])
 def ebi_sample(id):
-    meta = fetch_sample_metadata(id)[0]
+    meta = fetch_sample(id)[0][0]  # first binding row; fetch_sample returns (bindings, vars)
     page = render_template('ebi-sample.xml',sampleid=id,sequencer=meta['sequencer']['value'],date=meta['date']['value'],specimen=meta['specimen']['value'])
     return page
 
 @app.route('/api/search', methods=['GET'])
 def search():
     """
-    Execute a 'global search'
+    Execute a 'global search'. Currently just duplicates fetch one
+    sample. Should be more flexible FIXME.
     """
     s = request.args.get('s')
-    if s == "":
-        s = "MT326090.1"
-    query = """
-    PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-    PREFIX sio: <http://semanticscience.org/resource/>
-    PREFIX edam: <http://edamontology.org/>
-    select distinct ?id ?seq ?info
-    {
-    ?sample sio:SIO_000115 "%s" .
-    ?sample sio:SIO_000115 ?id .
-    ?seq pubseq:sample ?sample .
-    ?sample edam:data_2091 ?info .
-    } limit 100
-    """ % s
-    payload = {'query': query, 'format': 'json'}
-    r = requests.get(sparqlURL, params=payload)
-    result = r.json()['results']['bindings']
-    # metadata = file.name(seq)+"/metadata.yaml"
-    print(result)
-    return jsonify([{
-        'id': x['id']['value'],
-        'fasta': x['seq']['value'],
-        'collection': os.path.dirname(x['seq']['value']),
-        'info': x['info']['value'],
-    } for x in result])
+    if s == "": s = "MT326090.1"
+    return jsonify([fetch_one_record(s)])
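A minimal consumer sketch for the reworked endpoint (assuming the public instance at covid19.genenetwork.org; field names per the docstring above, which records point to other resources rather than inlining them):

    import requests

    rec = requests.get("http://covid19.genenetwork.org/api/sample/MT533203.1.json").json()[0]
    print(rec["permalink"])                        # stable PubSeq resource URL
    fasta = requests.get(rec["fasta"]).text        # raw sequence served from Arvados
    metadata = requests.get(rec["metadata"]).text  # original metadata.yaml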
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index b620946..504f03c 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -34,6 +34,7 @@ if not os.path.isfile('bh20sequploader/main.py'):
     print("WARNING: run FLASK from the root of the source repository!", file=sys.stderr)
 
 app = Flask(__name__, static_url_path='/static', static_folder='static')
+app.config['JSON_SORT_KEYS'] = False
 
 # Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes.
 # We will enforce the limit ourselves and set a higher safety limit here.
@@ -252,7 +253,7 @@ FORM_ITEMS = load_schema_generate_form()
 def get_feed_items(name, start=0, stop=9):
     redis_client = redis.Redis(host=os.environ.get('HOST', 'localhost'),
                                port=os.environ.get('PORT', 6379),
-                                db=os.environ.get('REDIS_DB', 0))
+                               db=os.environ.get('REDIS_DB', 0))
     feed_items = []
     try:
         for el in redis_client.zrevrange(name, start, stop):
@@ -272,12 +273,23 @@ def send_home():
     """
     Send the front page.
     """
+    (tweets,
+     commits,
+     pubmed_articles,
+     arxiv_articles) = [get_feed_items(x) for x in ["bh20-tweet-score:",
+                                                    "bh20-commit-score:",
+                                                    "bh20-pubmed-score:",
+                                                    "bh20-arxiv-score:"]]
     return render_template(
         'home.html', menu='HOME',
-        tweets=get_feed_items("bh20-tweet-score:"),
-        commits=get_feed_items("bh20-commit-score:"),
-        pubmed_articles=get_feed_items("bh20-pubmed-score:"),
-        arxiv_articles=get_feed_items("bh20-arxiv-score:"),
+        all_items=list(itertools.chain(tweets,
+                                       commits,
+                                       pubmed_articles,
+                                       arxiv_articles)),
+        tweets=tweets,
+        commits=commits,
+        pubmed_articles=pubmed_articles,
+        arxiv_articles=arxiv_articles,
         load_map=True)
 
@@ -750,8 +762,8 @@ union
     # http://covid19.genenetwork.org/resource/lugli-4zz18-gx0ifousk9yu0ql
     m = re.match(r"http://collections.lugli.arvadosapi.com/c=([^/]*)/sequence.fasta|http://covid19.genenetwork.org/resource/(.*)", sequenceuri)
     collection = m.group(1) or m.group(2)
-    fastauri = f"http://collections.lugli.arvadosapi.com/c={collection}/sequence.fasta"
-    metauri = f"http://collections.lugli.arvadosapi.com/c={collection}/metadata.yaml"
+    fastauri = f"https://collections.lugli.arvadosapi.com/c={collection}/sequence.fasta"
+    metauri = f"https://collections.lugli.arvadosapi.com/c={collection}/metadata.yaml"
     locationuri=sample['geo']['value']
     location=sample['geoname']['value']
     date=sample['date']['value']
diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
index fbc721e..e2f0c83 100644
--- a/bh20simplewebuploader/static/main.css
+++ b/bh20simplewebuploader/static/main.css
@@ -567,6 +567,7 @@ input[name="feed-tabs"] ~ .tab {
   display: none;
 }
 
+#tab-all-items:checked ~ .tab.content-all-items,
 #tab-pubmed-articles:checked ~ .tab.content-pubmed-articles,
 #tab-arxiv-articles:checked ~ .tab.content-arxiv-articles,
 #tab-tweets:checked ~ .tab.content-tweets,
diff --git a/bh20simplewebuploader/templates/home.html b/bh20simplewebuploader/templates/home.html
index a880f81..23f48bf 100644
--- a/bh20simplewebuploader/templates/home.html
+++ b/bh20simplewebuploader/templates/home.html
@@ -29,7 +29,9 @@
       </div>
 
       <div id="feed">
-        <input name="feed-tabs" type="radio" id="tab-arxiv-articles" checked/>
+        <input name="feed-tabs" type="radio" id="tab-all-items" checked/>
+        <label for="tab-all-items">All Items</label>
+        <input name="feed-tabs" type="radio" id="tab-arxiv-articles"/>
         <label for="tab-arxiv-articles">Arxiv</label>
         <input name="feed-tabs" type="radio" id="tab-pubmed-articles"/>
         <label for="tab-pubmed-articles">Pubmed</label>
@@ -37,6 +39,73 @@
         <label for="tab-tweets">Tweets</label>
         <input name="feed-tabs" type="radio" id="tab-commits"/>
         <label for="tab-commits">Commits</label>
+        <ul class="tab content-all-items">
+          <!-- Begin News -->
+          {% if all_items %}
+          {% for item in all_items|sort(reverse=true, attribute="score")%}
+          <li>
+            {% if item['authors'] %}
+            <!-- Arxiv article -->
+            <p>
+              <b>[arxiv]</b>
+              <a href="{{ item['url'] }}" target="_blank">
+                {{item['title']}}
+              </a>
+              <br/>
+              <b>Authors:</b> {{ item['authors'] }}
+              <br/>
+              <b>Abstract:</b> {{ item['abstract']}}...
+              <br/>
+              <b>Submitted:</b> {{ item['submission']}}
+            </p>
+
+            {% elif item['full-authors'] %}
+            <!-- Pubmed Article -->
+            <p><b>[Pubmed]:</b>
+              <a href="https://pubmed.ncbi.nlm.nih.gov/{{ item['docsum-pmid'] }}" target="_blank"><b>Summary:</b>
+                {{ item['summary'] }}
+              </a> <br/>
+              <b>Full Authors:</b> {{ item['full-authors'] }} <br/>
+              <b>Short Authors:</b> {{ item['short-authors'] }} <br/>
+              <b>Citation:</b> {{ item['citation'] }} <br/>
+              <b>Short Journal Citation:</b> {{ item['short-journal-citation'] }} <br/>
+            </p>
+
+            {% elif item['tweet'] %}
+            <!-- Tweets -->
+            <p>
+              <b>[Tweet]:</b>
+              {{ item['tweet']|urlize(40, target="_blank")}}
+              <small>
+                <a href="{{ item['url'] }}" target="_blank">source</a></small>
+              <br/>
+              by {{ item['author'] }}
+              <br/>
+              <small>{{ item['timeposted'] }}</small>
+            </p>
+
+            {% elif item['repository-url'] %}
+            <!-- Commits -->
+            <p>
+              <b>[Commit]:</b>
+              <a href="{{ item.url }}" target="_blank">
+                {{ item.hash.split(":")[-1][:7] }}: {{ item.content }}
+              </a>
+              <br/>
+              <small>
+                <a href="{{ item['repository-url'] }}" target="_blank"> {{ item.author }}/{{ item.repository }}</a>
+                on {{ item.timeposted }}
+              </small>
+            </p>
+            {% endif %}
+          </li>
+          {%endfor%}
+
+          {% else %}
+          There are no items to display :(
+          {% endif %}
+          <!-- End News -->
+        </ul>
         <ul class="tab content-arxiv-articles">
           {% if arxiv_articles %}
           {% for article in arxiv_articles|sort(reverse=true, attribute="score")%}
diff --git a/example/esr_example.yaml b/example/esr_example.yaml
index c97d0bf..c7bdb30 100644
--- a/example/esr_example.yaml
+++ b/example/esr_example.yaml
@@ -15,7 +15,7 @@ sample:
   collection_date: "2020-02-26"
   collection_location: https://www.wikidata.org/wiki/Q37100
   specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
-  source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] ??
+  source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
   additional_collection_information: Optional free text field for additional information
 
 virus:
@@ -23,7 +23,7 @@ virus:
   virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
 
 technology:
-  sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] // Nanopore MinION
+  sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION
   alignment_protocol: https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics
   assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
   additional_technology_information: "Artic V3 workflow"
diff --git a/example/uthsc_example.yaml b/example/uthsc_example.yaml
index 661cf60..589a7a5 100644
--- a/example/uthsc_example.yaml
+++ b/example/uthsc_example.yaml
@@ -23,7 +23,7 @@ virus:
   virus_strain: SARS-CoV-2/human/USA/AL_UT14/2020
 
 technology:
-  sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] // Nanopore MinION
+  sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION
   alignment_protocol: guppy
   assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
   additional_technology_information: Optional free text field for additional information
diff --git a/test/rest-api.org b/test/rest-api.org
index 66639c3..2ea2b11 100644
--- a/test/rest-api.org
+++ b/test/rest-api.org
@@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version
 }
 #+end_src
 
+The current API can fetch data:
+
+#+begin_src js
+curl http://covid19.genenetwork.org/api/search?s=MT533203.1
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence"
+  }
+]
+
+curl http://covid19.genenetwork.org/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "date": "2020-04-27",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+#+end_src
+
 The Python3 version is
 
 #+begin_src python :session :exports both
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 5464d1d..188ff6f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq
 
 ```sh
 # --- get list of IDs already in PubSeq
-../../tools/sparql-fetch-ids > pubseq_ids.txt
+../../tools/pubseq-fetch-ids > pubseq_ids.txt
+
 # --- get list of missing genbank IDs
 python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
 
@@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s
 
 ```
 
+## Validate GenBank data
+
+To pull the data from PubSeq, use the list of PubSeq ids generated
+above.
+
+
+
 # TODO
 
-- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
+- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
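The new "Validate GenBank data" section stops short of a command; a plausible invocation, using the pubseq-fetch-data.py script added at the end of this commit (flags per its argparse definition; the output directory is a hypothetical choice), would be:

    # sketch: pull PubSeq metadata (and FASTA) for the ids fetched above
    mkdir -p ~/tmp/pubseq
    python3 ../../tools/pubseq-fetch-data.py --out ~/tmp/pubseq --ids pubseq_ids.txt --fasta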
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index 1962daa..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -6,28 +6,19 @@
 #
 # See also directory .guix-run and README.md
 
-BATCH_SIZE=5000
-
 import argparse
-import json
-import os
-import requests
 import sys
-import xml.etree.ElementTree as ET
-from datetime import date, datetime
-from dateutil.parser import parse
+
+from Bio import Entrez
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--max', type=int, help='Max queries', required=False)
 parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False)
 args = parser.parse_args()
 
-from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-
-# min_acceptable_collection_date = datetime(2019, 12, 1)
+BATCH_SIZE = 5000
 
-today_date = date.today().strftime("%Y.%m.%d")
+Entrez.email = 'another_email@gmail.com'  # FIXME
 
 skip = set()
 if args.skip:
@@ -36,10 +27,11 @@ if args.skip:
     for line in content:
         skip.add(line.strip())
 
-print(f"Skip size is {len(skip)}",file=sys.stderr)
+print(f"Skip size is {len(skip)}", file=sys.stderr)
 
 # Try to search several strings
 TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
+
 # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with
 PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']
 
@@ -47,22 +39,27 @@ ids = set()
 for term in TERMS:
     num_read = BATCH_SIZE
     retstart = 0
+
     while num_read == BATCH_SIZE:
         record = Entrez.read(
-            Entrez.esearch(db='nuccore', term=term, idtype='acc',
-                           retstart=retstart, retmax=BATCH_SIZE)
+            Entrez.esearch(db='nuccore', term=term, idtype='acc', retstart=retstart, retmax=BATCH_SIZE)
         )
+
         idlist = record['IdList']
         new_ids = set(idlist)
         num_read = len(new_ids)
-        print(num_read,":",idlist[0],file=sys.stderr)
         retstart += num_read
-        new_ids.difference_update(skip) # remove skip ids
+
+        print(num_read, ":", idlist[0], file=sys.stderr)
+
+        new_ids.difference_update(skip)  # remove skip ids
         new_ids = set([id for id in new_ids if id[:2] not in PREFIX])
-        ids.update(new_ids) # add to total set
-        print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)})",file=sys.stderr)
+        ids.update(new_ids)  # add to total set
+
+        print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)}", file=sys.stderr)
+
         if args.max and len(ids) > args.max:
-            print(f"Stopping past #{args.max} items",file=sys.stderr)
+            print(f"Stopping past #{args.max} items", file=sys.stderr)
             break
 
 for id in ids:
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
             # print(n,file=sys.stderr)
             if n != 'Unpublished':
                 institute,address = n.split(',',1)
-                submitter.submitter_name = institute.split(') ')[1]
+                if ")" in institute:
+                    submitter.submitter_name = institute.split(')')[1]
                 submitter.submitter_address = address.strip()
         except AttributeError:
             pass
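The genbank.py change guards against institute strings that carry no ')' marker, which previously made the split raise an IndexError; with hypothetical values:

    # hypothetical submitter string from a GenBank reference block
    institute = "1) University of Somewhere"
    if ")" in institute:
        submitter_name = institute.split(')')[1]  # " University of Somewhere"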
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
 
 for xmlfn in args.files:
     print(f"--- Reading {xmlfn}")
-    with gzip.open(xmlfn, 'r') as f:
-        xml = f.read().decode()
-    tree = ET.fromstring(xml)
-    for gb in tree.findall('./GBSeq'):
-        valid = None
-        error = None
-        meta = {}
-        id = gb.find("GBSeq_locus").text
-        basename = dir+"/"+id
-        print(f"    parsing {id}")
-        try:
-            valid,meta = genbank.get_metadata(id,gb)
-            if valid:
-                # --- write JSON
-                jsonfn = basename + ".json"
-                with open(jsonfn, 'w') as outfile:
-                    print(f"    writing {jsonfn}")
-                    json.dump(meta, outfile, indent=4)
-                # --- write FASTA
-                fa = basename+".fa"
-                seq = genbank.get_sequence(id,gb)
-                print(f"    writing {fa}")
-                with open(fa,"w") as f2:
-                    f2.write(f"> {id}\n")
-                    f2.write(seq)
-            # print(seq)
-        except genbank.GBError as e:
-            error = f"{e} for {id}"
-            print(error,file=sys.stderr)
-            valid = False
-        state = {}
-        state['valid'] = valid
-        if error:
-            state['error'] = error
-        if meta['warnings']:
-            state['warnings'] = meta['warnings']
-        states[id] = state
+    try:
+        with gzip.open(xmlfn, 'r') as f:
+            xml = f.read().decode()
+    except Exception:
+        with open(xmlfn, 'r') as f:
+            xml = f.read()
+    tree = ET.fromstring(xml)
+    for gb in tree.findall('./GBSeq'):
+        valid = None
+        error = None
+        meta = {}
+        id = gb.find("GBSeq_locus").text
+        basename = dir+"/"+id
+        print(f"    parsing {id}")
+        try:
+            valid,meta = genbank.get_metadata(id,gb)
+            if valid:
+                # --- write JSON
+                jsonfn = basename + ".json"
+                with open(jsonfn, 'w') as outfile:
+                    print(f"    writing {jsonfn}")
+                    json.dump(meta, outfile, indent=4)
+                # --- write FASTA
+                fa = basename+".fa"
+                seq = genbank.get_sequence(id,gb)
+                print(f"    writing {fa}")
+                with open(fa,"w") as f2:
+                    f2.write(f"> {id}\n")
+                    f2.write(seq)
+            # print(seq)
+        except genbank.GBError as e:
+            error = f"{e} for {id}"
+            print(error,file=sys.stderr)
+            valid = False
+        state = {}
+        state['valid'] = valid
+        if error:
+            state['error'] = error
+        if meta['warnings']:
+            state['warnings'] = meta['warnings']
+        states[id] = state
 
 statefn = dir + '/state.json'
 with open(statefn, 'w') as outfile:
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
 from utils import chunks
 
 from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-BATCH=100
+
+Entrez.email = 'another_email@gmail.com'  # FIXME
+
+BATCH = 100
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
 parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
 parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
 args = parser.parse_args()
 
 ids = set()
 with open(args.ids) as f:
-    content = f.readlines()
-    for line in content:
-        ids.add(line.strip())
+    ids.update([line.strip() for line in f])
 
 dir = args.out
 if not os.path.exists(dir):
@@ -37,12 +36,14 @@ if not os.path.exists(dir):
 
 request_num = BATCH
 if args.max:
-    request_num = min(BATCH,args.max)
+    request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+    xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
 
-for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
-    print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
-        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-    if args.max and i*request_num >= args.max:
+        f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+    if args.max and num_chunk * request_num >= args.max:
         break
diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
index 3efc67a..96920a5 100644
--- a/workflows/pull-data/genbank/utils.py
+++ b/workflows/pull-data/genbank/utils.py
@@ -1,5 +1,6 @@
 import os
 
+
 def is_integer(string_to_check):
     try:
         int(string_to_check)
@@ -7,19 +8,26 @@ def is_integer(string_to_check):
     except ValueError:
         return False
 
+
 def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
+
 def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
-    # Check duplicated entry looking at all dictionaries
+    """
+    Check duplicated entry by looking in all dictionaries
+    """
+
     field_to_term_to_uri_dict = {}
 
-    path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
-                              os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+    path_dict_xxx_csv_list = [
+        os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+        os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
+    ]
 
     for path_dict_xxx_csv in path_dict_xxx_csv_list:
-        print('Read {}'.format(path_dict_xxx_csv))
+        print(f'Read {path_dict_xxx_csv}')
 
         with open(path_dict_xxx_csv) as f:
             for line in f:
@@ -31,7 +39,7 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict:
-                    print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                    print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[term] = uri
@@ -54,9 +62,9 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict[field]:
-                    print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                    print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[field][term] = uri
 
-    return field_to_term_to_uri_dict
\ No newline at end of file
+    return field_to_term_to_uri_dict
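update-from-genbank.py batches its Entrez requests with chunks() from utils.py; on an uneven list the final chunk is simply shorter (example values):

    >>> from utils import chunks
    >>> list(chunks(list(range(5)), 2))
    [[0, 1], [2, 3], [4]]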
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
new file mode 100755
index 0000000..ef4edde
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import requests
+import sys
+import time
+
+parser = argparse.ArgumentParser(description="""
+
+Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
+can be passed in on the command line or in a file.
+
+""")
+parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
+parser.add_argument('--out', type=str, help='Directory to write to',
+                    required=True)
+parser.add_argument('--ids', type=str, help='File with ids', required=False)
+parser.add_argument('id', nargs='*', help='id(s)')
+args = parser.parse_args()
+
+dir = args.out
+if not os.path.exists(dir):
+    raise Exception(f"Directory {dir} does not exist")
+
+ids = args.id
+if (len(ids)==0):
+    print(f"Reading {args.ids}")
+    with open(args.ids) as f:
+        ids = [ l.strip() for l in f.readlines() ]
+
+for id in ids:
+    print(id)
+    jsonfn = dir+"/"+id+".json"
+    if not os.path.exists(jsonfn):
+        count = 0
+        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        while not r:
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
diff --git a/workflows/tools/sparql-fetch-ids b/workflows/tools/pubseq-fetch-ids
index 19b2d82..f5920ec 100755
--- a/workflows/tools/sparql-fetch-ids
+++ b/workflows/tools/pubseq-fetch-ids
@@ -2,7 +2,7 @@
 #
 # Use a SPARQL query to fetch all IDs in the PubSeq database
 #
-# sparql-fetch-ids > pubseq_ids.txt
+# pubseq-fetch-ids > pubseq_ids.txt
 #
 # Note: requires Ruby 3.x. Older Ruby gives a syntax error
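A sample invocation of the new fetch script with ids on the command line (the output directory must already exist, per the check in the code; ids taken from the examples above):

    mkdir -p /tmp/pubseq
    python3 workflows/tools/pubseq-fetch-data.py --out /tmp/pubseq --fasta MT533203.1 MT326090.1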