From 9d75ce088e6388bf23ae077fd06b2a3f51be1bda Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 5 Jan 2021 09:34:26 +0000 Subject: API: fix returned record to include original metadata --- bh20simplewebuploader/api.py | 34 ++++++++++++++++++++++++++++++++-- test/rest-api.org | 29 +++++++++++++++++++++++++++++ workflows/pull-data/genbank/README.md | 12 ++++++++++-- workflows/tools/pubseq-fetch-ids | 2 +- 4 files changed, 72 insertions(+), 5 deletions(-) diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py index b1b505f..11c74f2 100644 --- a/bh20simplewebuploader/api.py +++ b/bh20simplewebuploader/api.py @@ -7,6 +7,9 @@ import sys from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify from bh20simplewebuploader.main import app, sparqlURL +PUBSEQ="http://covid19.genenetwork.org" +ARVADOS="https://collections.lugli.arvadosapi.com/c=" + # Helper functions def fetch_sample_metadata(id): @@ -42,13 +45,40 @@ def version(): @app.route('/api/sample/<id>.json') def sample(id): + """ + +API sample should return a record pointing to other resources, +notably: permalink, original metadata record and the fasta +data. + +curl http://localhost:5067/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "date": "2020-04-27", + "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 
2.17", + "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml", + "permalink": "http://covid19.genenetwork.org/resource/MT533203.1", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] + + +""" # metadata = file.name(seq)+"/metadata.yaml" meta = fetch_sample_metadata(id) print(meta) + # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml return jsonify([{ 'id': x['id']['value'], - 'fasta': x['seq']['value'], - 'collection': os.path.dirname(x['seq']['value']), + 'collection': x['seq']['value'], + 'permalink': PUBSEQ+'/resource/'+x['id']['value'], + 'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta', + 'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml', 'date': x['date']['value'], 'info': x['info']['value'], 'specimen': x['specimen']['value'], diff --git a/test/rest-api.org b/test/rest-api.org index 66639c3..2ea2b11 100644 --- a/test/rest-api.org +++ b/test/rest-api.org @@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version } #+end_src +The current API can fetch data + +#+begin_src js +curl http://covid19.genenetwork.org/api/search?s=MT533203.1 +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence" + } +] + +curl http://covid19.genenetwork.org/api/sample/MT533203.1.json +[ + { + "collection": "http://covid19.genenetwork.org/resource", + "date": "2020-04-27", + "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks", + "id": "MT533203.1", + "info": "http://identifiers.org/insdc/MT533203.1#sequence", + "mapper": "minimap v. 
2.17", + "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632", + "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831" + } +] +#+end_src + + The Python3 version is #+begin_src python :session :exports both diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index 5464d1d..188ff6f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -../../tools/sparql-fetch-ids > pubseq_ids.txt +../../tools/pubseq-fetch-ids > pubseq_ids.txt + # --- get list of missing genbank IDs python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt @@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s ``` +## Validate GenBank data + +To pull the data from PubSeq use the list of pubseq ids generated +above. + + + # TODO -- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank +- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids index 19b2d82..f5920ec 100755 --- a/workflows/tools/pubseq-fetch-ids +++ b/workflows/tools/pubseq-fetch-ids @@ -2,7 +2,7 @@ # # Use a SPARQL query to fetch all IDs in the PubSeq database # -# sparql-fetch-ids > pubseq_ids.txt +# pubseq-fetch-ids > pubseq_ids.txt # # Note: requires Ruby 3.x. Older Ruby gives a syntax error -- cgit v1.2.3