From 3c09a92423408d01b64e1b842c6b96778939d098 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 07:11:13 +0000
Subject: Rename

---
 workflows/tools/pubseq-fetch-ids | 67 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100755 workflows/tools/pubseq-fetch-ids

(limited to 'workflows/tools/pubseq-fetch-ids')
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
new file mode 100755
index 0000000..19b2d82
--- /dev/null
+++ b/workflows/tools/pubseq-fetch-ids
@@ -0,0 +1,67 @@
+#!/usr/bin/env ruby
+#
+# Use a SPARQL query to fetch all IDs in the PubSeq database
+#
+#   sparql-fetch-ids > pubseq_ids.txt
+#
+# Note: requires Ruby 3.x. Older Ruby gives a syntax error
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+require 'pp'
+
+MAX=5_000 # rows requested per SPARQL call; used as the LIMIT of each page
+
+SPARQL_HEADER="
+prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+prefix dc: <http://purl.org/dc/terms/>
+prefix schema: <https://schema.org/>
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+" # namespace prefixes that sparql() prepends to every query it sends
+
+# Send SPARQL_HEADER + query to the endpoint and return an Array with one
+# entry per result binding: a Hash, or transform.call(hash) if transform is set
+def sparql query, transform = nil
+  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
+
+  response = Net::HTTP.get_response(URI.parse(api_url)) # NOTE(review): HTTP status is not checked before the body is parsed
+  data = JSON.parse(response.body,symbolize_names: true)
+  data => { head: { vars: }, results: { bindings: results} } # rightward pattern match; raises if either key is missing
+  vars = vars.map { |v| v.to_sym } # variable names as symbols, matching the symbolized JSON keys
+  results.map { |rec|
+    # Flatten each binding into a Hash of variable name => :value field.
+    # The optional transform lambda then receives that Hash and its
+    # return value replaces the Hash in the result, so it can pick a
+    # single field, build an array, or wrap it in an OpenStruct.
+    res = {}
+    vars.each { |name| res[name] = rec[name][:value] } # NOTE(review): rec[name] would be nil for an unbound variable - confirm queries never leave one unbound
+    if transform
+      transform.call(res)
+    else
+      res
+    end
+  }
+end
+
+start = 0 # OFFSET of the page to request next
+num = MAX # page size, interpolated as LIMIT below
+begin # do-while form: the body runs once before the condition is tested
+  query = "
+SELECT DISTINCT ?id
+FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
+WHERE {
+
+  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
+
+} LIMIT #{num} OFFSET #{start}
+"
+  list = sparql(query, lambda { |rec| rec[:id] }) # keep only the ?id value of each row
+  list.each do | l |
+    print(l,"\n")
+  end
+  $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # progress goes to stderr so stdout carries only the IDs
+  start += num
+end while list.size == MAX # a page shorter than MAX ends the loop
-- 
cgit 1.4.1


From 9d75ce088e6388bf23ae077fd06b2a3f51be1bda Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 5 Jan 2021 09:34:26 +0000
Subject: API: fix returned record to include original metadata

---
 bh20simplewebuploader/api.py          | 34 ++++++++++++++++++++++++++++++++--
 test/rest-api.org                     | 29 +++++++++++++++++++++++++++++
 workflows/pull-data/genbank/README.md | 12 ++++++++++--
 workflows/tools/pubseq-fetch-ids      |  2 +-
 4 files changed, 72 insertions(+), 5 deletions(-)

(limited to 'workflows/tools/pubseq-fetch-ids')

diff --git a/bh20simplewebuploader/api.py b/bh20simplewebuploader/api.py
index b1b505f..11c74f2 100644
--- a/bh20simplewebuploader/api.py
+++ b/bh20simplewebuploader/api.py
@@ -7,6 +7,9 @@ import sys
 from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
 from bh20simplewebuploader.main import app, sparqlURL
 
+PUBSEQ="http://covid19.genenetwork.org"
+ARVADOS="https://collections.lugli.arvadosapi.com/c="
+
 # Helper functions
 
 def fetch_sample_metadata(id):
@@ -42,13 +45,40 @@ def version():
 
 @app.route('/api/sample/<id>.json')
 def sample(id):
+    """
+
+API sample should return a record pointing to other resources,
+notably: permalink, original metadata record and the fasta
+data.
+
+curl http://localhost:5067/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "date": "2020-04-27",
+    "fasta": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/sequence.fasta",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "metadata": "https://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml",
+    "permalink": "http://covid19.genenetwork.org/resource/MT533203.1",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+
+
+"""
     # metadata = file.name(seq)+"/metadata.yaml"
     meta = fetch_sample_metadata(id)
     print(meta)
+    # http://collections.lugli.arvadosapi.com/c=lugli-4zz18-uovend31hdwa5ks/metadata.yaml
     return jsonify([{
         'id': x['id']['value'],
-        'fasta': x['seq']['value'],
-        'collection': os.path.dirname(x['seq']['value']),
+        'collection': x['seq']['value'],
+        'permalink': PUBSEQ+'/resource/'+x['id']['value'],
+        'fasta': ARVADOS+os.path.basename(x['seq']['value'])+'/sequence.fasta',
+        'metadata': ARVADOS+os.path.basename(x['seq']['value'])+'/metadata.yaml',
         'date': x['date']['value'],
         'info': x['info']['value'],
         'specimen': x['specimen']['value'],
diff --git a/test/rest-api.org b/test/rest-api.org
index 66639c3..2ea2b11 100644
--- a/test/rest-api.org
+++ b/test/rest-api.org
@@ -36,6 +36,35 @@ curl http://covid19.genenetwork.org/api/version
 }
 #+end_src
 
+The current API can fetch data
+
+#+begin_src js
+curl http://covid19.genenetwork.org/api/search?s=MT533203.1
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence"
+  }
+]
+
+curl http://covid19.genenetwork.org/api/sample/MT533203.1.json
+[
+  {
+    "collection": "http://covid19.genenetwork.org/resource",
+    "date": "2020-04-27",
+    "fasta": "http://covid19.genenetwork.org/resource/lugli-4zz18-uovend31hdwa5ks",
+    "id": "MT533203.1",
+    "info": "http://identifiers.org/insdc/MT533203.1#sequence",
+    "mapper": "minimap v. 2.17",
+    "sequencer": "http://www.ebi.ac.uk/efo/EFO_0008632",
+    "specimen": "http://purl.obolibrary.org/obo/NCIT_C155831"
+  }
+]
+#+end_src
+
+
 The Python3 version is
 
 #+begin_src python :session :exports both
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index 5464d1d..188ff6f 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq
 
 ```sh
 # --- get list of IDs already in PubSeq
-../../tools/sparql-fetch-ids > pubseq_ids.txt
+../../tools/pubseq-fetch-ids > pubseq_ids.txt
+
 # --- get list of missing genbank IDs
 python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
 
@@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s
 
 ```
 
+## Validate GenBank data
+
+To pull the data from PubSeq use the list of pubseq ids generated
+above.
+
+
+
 # TODO
 
-- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
+- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
diff --git a/workflows/tools/pubseq-fetch-ids b/workflows/tools/pubseq-fetch-ids
index 19b2d82..f5920ec 100755
--- a/workflows/tools/pubseq-fetch-ids
+++ b/workflows/tools/pubseq-fetch-ids
@@ -2,7 +2,7 @@
 #
 # Use a SPARQL query to fetch all IDs in the PubSeq database
 #
-#   sparql-fetch-ids > pubseq_ids.txt
+#   pubseq-fetch-ids > pubseq_ids.txt
 #
 # Note: requires Ruby 3.x. Older Ruby gives a syntax error
 
-- 
cgit 1.4.1