aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2020-12-31 14:17:42 +0000
committerPjotr Prins2020-12-31 14:17:42 +0000
commit3541089aa8af5d229e669eb38d3735cd2b0b8a05 (patch)
treef06614daeefe9c466828528196b6fc75332330a8
parent918b3d2270acef9ebbc824dabc9c9433e0b30868 (diff)
downloadbh20-seq-resource-3541089aa8af5d229e669eb38d3735cd2b0b8a05.tar.gz
bh20-seq-resource-3541089aa8af5d229e669eb38d3735cd2b0b8a05.tar.lz
bh20-seq-resource-3541089aa8af5d229e669eb38d3735cd2b0b8a05.zip
genbank: sparql-fetch-ids
-rw-r--r--workflows/pull-data/genbank/README.md6
-rwxr-xr-xworkflows/pull-data/genbank/sparql-fetch-ids57
2 files changed, 62 insertions, 1 deletions
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index ee67e70..0204dd0 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -1,4 +1,4 @@
-Pipeline:
+# pipeline
```sh
# --- get list of IDs already in PubSeq
@@ -11,3 +11,7 @@ genbank-fetch-ids --dir ~/tmp/pubseq > genbank_ids.txt
for id in genbank_ids.txt:
transform-genbank-xml2yamlfa --dir ~/tmp/genbank id --outdir ~/tmp/pubseq
```
+
+# TODO
+
+- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank
diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids
index e69de29..9a8b8ee 100755
--- a/workflows/pull-data/genbank/sparql-fetch-ids
+++ b/workflows/pull-data/genbank/sparql-fetch-ids
@@ -0,0 +1,57 @@
+#!/usr/bin/env ruby
+
+require 'net/http'
+require 'json'
+require 'ostruct'
+require 'erb'
+
+# Prefix declarations that sparql prepends to every query before submitting it
+SPARQL_HEADER="
+prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+prefix dc: <http://purl.org/dc/terms/>
+prefix schema: <https://schema.org/>
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+"
+
+# Build a SPARQL query, submit it and return the results.
+#
+# q         - query text; SPARQL_HEADER is prepended before submission
+# transform - optional callable applied to each result row
+#
+# Returns an Array with one entry per result binding: a Hash mapping each
+# selected variable name (as a Symbol) to the 'value' field of its binding,
+# or whatever transform returns for that Hash. A variable that is missing
+# from a binding maps to nil.
+#
+# Raises RuntimeError when the endpoint does not answer with a 2xx status.
+def sparql(q, transform = nil)
+  query = SPARQL_HEADER + q
+  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(query)}"
+
+  response = Net::HTTP.get_response(URI.parse(api_url))
+  # Fail here with the HTTP status; otherwise an error page would reach
+  # JSON.parse and surface as a misleading parser error.
+  unless response.is_a?(Net::HTTPSuccess)
+    raise "SPARQL request failed: #{response.code} #{response.message}"
+  end
+
+  data = JSON.parse(response.body)
+  vars = data['head']['vars']
+  data['results']['bindings'].map do |rec|
+    # dig returns nil instead of raising when a variable is absent from rec
+    res = vars.each_with_object({}) do |name, row|
+      row[name.to_sym] = rec.dig(name, 'value')
+    end
+    transform ? transform.call(res) : res
+  end
+end
+
+# Number of IDs requested per query
+MAX = 5_000
+
+# Fetch the IDs page by page: each ID goes to stdout on its own line, and
+# after every page the next offset and the page's first ID go to stderr.
+# A page that does not contain exactly MAX rows ends the loop.
+offset = 0
+page_size = MAX
+loop do
+  query = "
+select distinct ?id where {
+
+?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
+
+} limit #{page_size} offset #{offset}
+"
+  ids = sparql(query, ->(rec) { rec[:id] })
+  ids.each { |id| print(id, "\n") }
+  offset += page_size
+  $stderr.print(offset, ":", ids.first, "\n")
+  break unless ids.size == MAX
+end