aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data/genbank/sparql-fetch-ids
blob: 683044cd9d7471640aa39ce9a439efc5ea659da5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env ruby
#
# Use a SPARQL query to fetch all IDs in the PubSeq database
#
#   sparql-fetch-ids > pubseq_ids.txt
#

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'

SPARQL_HEADER="
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix dc: <http://purl.org/dc/terms/>
prefix schema: <https://schema.org/>
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
"

# Build a SPARQL query, submit it and return the results.
#
# SPARQL_HEADER (the shared prefix declarations) is prepended to +q+, the
# full query is URL-encoded into a GET request, and the JSON response is
# flattened into one Hash per result row, keyed by the variable names
# (as symbols) listed in the response head.
#
# q         - String holding the SPARQL query body (without the prefixes).
# transform - optional callable; when passed in, it is called with each row
#             Hash and its return value replaces the row in the result.
#
# Returns an Array with one entry per result binding. A variable that is
# not bound in a given row maps to nil.
# Raises RuntimeError when the endpoint does not answer with a 2xx status,
# and JSON::ParserError when the response body is not valid JSON.
def sparql(q, transform = nil)
  query = "#{SPARQL_HEADER}#{q}"
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(query)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  # Fail early with the HTTP status instead of trying to parse an error
  # page as JSON.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL query failed: HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body)
  vars = data['head']['vars']
  data['results']['bindings'].map do |rec|
    row = vars.each_with_object({}) do |name, res|
      # Unbound variables are left out of a binding, so the cell may be nil.
      cell = rec[name]
      res[name.to_sym] = cell && cell['value']
    end
    transform ? transform.call(row) : row
  end
end

# Number of IDs requested per query (page size).
MAX = 5_000

# Fetch the IDs page by page: each query asks for the next slice via
# LIMIT/OFFSET, the IDs go to stdout (one per line) and a progress line
# goes to stderr. A page that is not completely full ends the loop.
offset = 0
page_size = MAX
loop do
  query = "
SELECT DISTINCT ?id
FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
WHERE {

  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .

} LIMIT #{page_size} OFFSET #{offset}
"
  ids = sparql(query, ->(rec) { rec[:id] })
  ids.each { |id| print(id, "\n") }
  # show progress: offset range of this page and its first ID
  $stderr.print("#{offset}-#{offset + ids.size}:#{ids.first}\n")
  offset += page_size
  break unless ids.size == MAX
end