aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data/genbank/sparql-fetch-ids
blob: 9a8b8eeedde6d08e4f90b03f301783f6d6dfe219 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env ruby

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'

# Namespace prefix declarations that the sparql method prepends to every
# query body. The text starts with an empty line and ends with a newline.
SPARQL_HEADER = <<~SPARQL_PREFIXES

  prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  prefix dc: <http://purl.org/dc/terms/>
  prefix schema: <https://schema.org/>
  PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
SPARQL_PREFIXES

# Run a SPARQL query against the sparql.genenetwork.org endpoint and return
# its result rows.
#
# SPARQL_HEADER (the shared prefix declarations) is prepended to +q+, the
# full query is URL-encoded into a GET request, and the JSON response is
# flattened into one Hash per result row. Each Hash is keyed by the variable
# names listed in the response head (as symbols) and holds the 'value' string
# of the matching binding. A variable that is unbound in a row (the JSON
# results format leaves it out of that row's binding object) maps to nil
# instead of raising NoMethodError.
#
# q         - String with the query body, without prefix declarations.
# transform - optional callable; when given, every row Hash is passed to it
#             and its return value is collected in place of the Hash.
#
# Returns an Array with one entry per result row.
# Raises RuntimeError when the endpoint does not answer with a 2xx status.
def sparql(q, transform = nil)
  query = SPARQL_HEADER + q
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(query)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  # Fail with the HTTP status rather than letting JSON.parse choke on an
  # error page.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL request failed: HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body)
  vars = data['head']['vars']
  data['results']['bindings'].map do |rec|
    row = vars.each_with_object({}) do |name, acc|
      # dig yields nil when the variable is absent from this binding.
      acc[name.to_sym] = rec.dig(name, 'value')
    end
    transform ? transform.call(row) : row
  end
end

# Number of ids requested per query; a page with exactly this many rows is
# taken to mean that more rows may follow.
MAX = 5_000

# Walk through all original_fasta_label ids one page at a time. Every id is
# written to stdout on its own line; after each page the next offset and the
# first id of that page are written to stderr as a progress marker.
offset = 0
loop do
  paged_query = <<~SPARQL_PAGE

    select distinct ?id where {

    ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .

    } limit #{MAX} offset #{offset}
  SPARQL_PAGE
  page = sparql(paged_query, ->(row) { row[:id] })
  page.each { |id| print("#{id}\n") }
  offset += MAX
  $stderr.print("#{offset}:#{page.first}\n")
  # A page that is not completely filled is the last one.
  break unless page.size == MAX
end