From 3541089aa8af5d229e669eb38d3735cd2b0b8a05 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 31 Dec 2020 14:17:42 +0000 Subject: genbank: sparql-fetch-ids --- workflows/pull-data/genbank/README.md | 6 ++- workflows/pull-data/genbank/sparql-fetch-ids | 57 ++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'workflows') diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index ee67e70..0204dd0 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -1,4 +1,4 @@ -Pipeline: +# pipeline ```sh # --- get list of IDs already in PubSeq @@ -11,3 +11,7 @@ genbank-fetch-ids --dir ~/tmp/pubseq > genbank_ids.txt for id in genbank_ids.txt: transform-genbank-xml2yamlfa --dir ~/tmp/genbank id --outdir ~/tmp/pubseq ``` + +# TODO + +- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/pull-data/genbank/sparql-fetch-ids b/workflows/pull-data/genbank/sparql-fetch-ids index e69de29..9a8b8ee 100755 --- a/workflows/pull-data/genbank/sparql-fetch-ids +++ b/workflows/pull-data/genbank/sparql-fetch-ids @@ -0,0 +1,57 @@ +#!/usr/bin/env ruby + +require 'net/http' +require 'json' +require 'ostruct' +require 'erb' + +SPARQL_HEADER=" +prefix rdfs: +prefix rdf: +prefix dc: +prefix schema: +PREFIX pubseq: +" + +# Build a SPARQL query, submit and return results. 
# Apply transform lambda (when given) to every result row.
#
# Prepends SPARQL_HEADER to the query text, sends the query to the SPARQL
# endpoint with an HTTP GET, parses the JSON reply and converts each entry of
# results.bindings into a Hash keyed by the symbolised variable names listed
# in head.vars.
#
# @param q [String] SPARQL query text, without the prefix header
# @param transform [#call, nil] optional callable invoked with each row Hash;
#   its return value replaces the row in the returned list
# @return [Array] one element per binding: the row Hash, or whatever
#   +transform+ returned for it
# @raise [RuntimeError] when the endpoint does not answer with a 2xx status
# @raise [JSON::ParserError] when the response body is not valid JSON
def sparql(q, transform = nil)
  q = SPARQL_HEADER + q
  api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(q)}"

  response = Net::HTTP.get_response(URI.parse(api_url))
  # Fail with the HTTP status instead of a confusing JSON parse error when
  # the endpoint answers with an error page.
  unless response.is_a?(Net::HTTPSuccess)
    raise "SPARQL endpoint returned HTTP #{response.code} #{response.message}"
  end

  data = JSON.parse(response.body)
  vars = data['head']['vars']
  bindings = data['results']['bindings']
  bindings.map do |rec|
    row = vars.each_with_object({}) do |name, res|
      # A binding object omits variables that are unbound for that solution;
      # dig yields nil for those instead of raising NoMethodError.
      res[name.to_sym] = rec.dig(name, 'value')
    end
    transform ? transform.call(row) : row
  end
end

# Page size requested from the endpoint on every round trip.
MAX = 5_000

# Fetch the ids page by page, writing one id per line to stdout and a progress
# marker (next offset and first id of the page) to stderr. A page shorter than
# MAX is taken as the last one.
#
# NOTE(review): the triple pattern below shows only a subject and an object in
# this view; the predicate IRI was presumably stripped by the HTML export of
# the patch -- confirm against the repository before running.
# NOTE(review): the query pages with limit/offset but has no ORDER BY, so the
# page contents rely on the endpoint returning rows in a stable order --
# confirm this holds for the target server.
start = 0
loop do
  query = "
select distinct ?id where {

?arvid ?id .

} limit #{MAX} offset #{start}
"
  list = sparql(query, ->(rec) { rec[:id] })
  list.each { |id| print(id, "\n") }
  start += MAX
  $stderr.print(start, ":", list.first, "\n")
  break unless list.size == MAX
end
# -- cgit v1.2.3