about | summary | refs | log | tree | commit | diff
path: root/workflows/pull-data/genbank/sparql-fetch-ids
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-04 08:58:38 +0000
committer Pjotr Prins 2021-01-04 08:58:38 +0000
commit 1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (patch)
tree 34cc42ef12b81c05be8a57ca2a973b97e52f8461 /workflows/pull-data/genbank/sparql-fetch-ids
parent ba4161b1660c3a67090dd3715e9862906fb1cc5f (diff)
download bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.gz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.lz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.zip
Started on normalization
Diffstat (limited to 'workflows/pull-data/genbank/sparql-fetch-ids')
-rwxr-xr-x workflows/pull-data/genbank/sparql-fetch-ids 67
1 files changed, 0 insertions, 67 deletions
diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids
deleted file mode 100755
index 19b2d82..0000000
--- a/workflows/pull-data/genbank/sparql-fetch-ids
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Use a SPARQL query to fetch all IDs in the PubSeq database
-#
-# sparql-fetch-ids > pubseq_ids.txt
-#
-# Note: requires Ruby 3.x (rightward pattern matching with =>); older Ruby gives a syntax error
-
-require 'net/http'
-require 'json'
-require 'ostruct'
-require 'erb'
-require 'pp'
-
-MAX=5_000 # rows requested per SPARQL request (the LIMIT of each page)
-# Prefix declarations; sparql prepends this text to every query it sends
-SPARQL_HEADER="
-prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-prefix dc: <http://purl.org/dc/terms/>
-prefix schema: <https://schema.org/>
-PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-"
-
-# Run +query+ (SPARQL_HEADER is prepended) against the GeneNetwork SPARQL
-# endpoint and return an Array with one entry per result binding. Each
-# binding becomes a Hash of variable name (Symbol) => value (String); a
-# variable that is unbound in a row maps to nil. When +transform+ (any
-# object responding to #call) is given, every Hash is passed through it,
-# so a caller can reduce a row, build an Array or wrap it in an OpenStruct.
-# Raises a Net::HTTP exception on a non-2xx reply, JSON::ParserError on a
-# non-JSON body and NoMatchingPatternError on an unexpected JSON shape.
-def sparql(query, transform = nil)
-  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
-
-  response = Net::HTTP.get_response(URI.parse(api_url))
-  response.value # raise on HTTP errors instead of parsing an error page
-  data = JSON.parse(response.body, symbolize_names: true)
-  data => { head: { vars: }, results: { bindings: results } }
-  names = vars.map(&:to_sym) # symbolize_names only converts keys
-  results.map do |rec|
-    # SPARQL JSON omits unbound variables from a binding, so dig (not
-    # [][]) keeps such rows from raising NoMethodError on nil
-    res = names.to_h { |name| [name, rec.dig(name, :value)] }
-    transform ? transform.call(res) : res
-  end
-end
-
-# Page through every original_fasta_label in the metadata graph, MAX rows
-# per request; IDs go to stdout one per line, progress goes to stderr.
-# NOTE(review): there is no ORDER BY, so rows may not come back in the
-# same order for every LIMIT/OFFSET page - confirm against the endpoint.
-0.step(by: MAX) do |offset|
-  query = "
-SELECT DISTINCT ?id
-FROM <http://covid-19.genenetwork.org/graph/metadata.ttl>
-WHERE {
-
- ?arvid <http://biohackathon.org/bh20-seq-schema/original_fasta_label> ?id .
-
-} LIMIT #{MAX} OFFSET #{offset}
-"
-  ids = sparql(query, ->(rec) { rec[:id] })
-  ids.each { |id| print(id, "\n") }
-  $stderr.print("#{offset}-#{offset + ids.size}:#{ids.first}\n") # show progress
-  break unless ids.size == MAX # a short page is the last one
-end