aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data/genbank/sparql-fetch-ids
blob: 19b2d821a13f9ac503b8d37b9b8c0f7280580944 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env ruby
#
# Use a SPARQL query to fetch all IDs in the PubSeq database
#
#   sparql-fetch-ids > pubseq_ids.txt
#
# Note: requires Ruby 3.x. Older Ruby gives a syntax error

require 'net/http'
require 'json'
require 'ostruct'
require 'erb'
require 'pp'

# Page size: number of rows requested per SPARQL query when paging
# through the ID list.
MAX = 5_000

# Namespace prefix declarations that `sparql` prepends to every query.
# Frozen so the shared header cannot be modified in place by accident.
SPARQL_HEADER = "
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix dc: <http://purl.org/dc/terms/>
prefix schema: <https://schema.org/>
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
".freeze

# Run a SPARQL query against the GeneNetwork endpoint and return one
# entry per result row.
#
# SPARQL_HEADER is prepended to +query+ and the combined text is
# URL-encoded into a GET request. Each binding in the JSON response is
# flattened into a Hash that maps every selected variable (as a Symbol)
# to its value; a variable that is unbound in a row maps to nil.
#
# @param query [String] query text without the prefix declarations
# @param transform [#call, nil] optional callable applied to each row
#   Hash; its return value replaces the row in the result, so it may
#   reduce the row, build an array, or wrap it in an OpenStruct
# @return [Array] row Hashes, or whatever +transform+ returned for them
# @raise [RuntimeError] when the endpoint answers with a non-2xx status
def sparql(query, transform = nil)
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  # Report the HTTP status directly; otherwise an error page would only
  # surface later as a confusing JSON parse failure.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL request failed: HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body, symbolize_names: true)
  data => { head: { vars: }, results: { bindings: } }
  # symbolize_names only converts Hash keys, so the variable names in
  # head.vars are still Strings here.
  names = vars.map(&:to_sym)
  bindings.map do |binding_row|
    # dig yields nil for variables missing from this row (unbound
    # variables are left out of a binding) instead of raising.
    row = names.to_h { |name| [name, binding_row.dig(name, :value)] }
    transform ? transform.call(row) : row
  end
end

# Page through the ID list MAX rows at a time. Every ID is written to
# stdout, one per line, and one progress line per page goes to stderr.
# A page holding anything other than exactly MAX rows ends the loop.
offset = 0
page_size = MAX
loop do
  id_query = "
SELECT DISTINCT ?id
FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
WHERE {

  ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .

} LIMIT #{page_size} OFFSET #{offset}
"
  # Reduce each result row to its bare ?id value
  ids = sparql(id_query, ->(rec) { rec[:id] })
  ids.each { |id| print(id, "\n") }
  $stderr.print("#{offset}-#{offset + ids.size}:#{ids.first}\n") # show progress
  offset += page_size
  break unless ids.size == MAX
end