scripts/db_enrichment/country_enrichment.py - bh20-seq-resource - Tool to upload SARS-CoV-2 sequences to BH20 Arvados instance and orchestrate analysis

# This script by @LLTommy queries the main SPARQL endpoint to find which
# collections are missing country information for GPS coordinates, such as
#
# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ;
#    ns1:P17 <http://www.wikidata.org/entity/Q664> ;
#    ns1:P625 "Point(172.0 -43.6)" .
#
# See also the ./readme.md

import requests
import csv
from rdflib import Graph, Literal, RDF, URIRef
import time


# HTTP headers sent along with the SPARQL requests. The User-Agent names this
# crawler and carries a contact address.
sparqlHeaders = {
    'User-Agent':
        'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0',
}


def callSPARQL(query):
    """Run a SPARQL query against the Wikidata endpoint and return its bindings.

    The query is sent as a GET request asking for JSON. After every request
    the function sleeps one second to stay polite. If the response carries a
    ``Retry-After`` header (the server is throttling us) it waits 45 seconds
    and retries exactly once.

    Args:
        query: Complete SPARQL query text.

    Returns:
        The list found under ``results.bindings`` in the JSON response.

    Raises:
        Exception: If the request fails or the response is not the expected
            JSON document. The original error is attached as ``__cause__``.
    """
    payload = {'query': query, 'format': 'json'}
    # Pre-bind so the error handler can tell whether any response arrived.
    r = None
    try:
        r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
        time.sleep(1)
        # Slow down in case we sent too many requests: sleep, then try again.
        # The HTTP header is spelled with a hyphen; an underscore never matches.
        if 'Retry-After' in r.headers:
            print(r.headers)
            time.sleep(45)
            r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)

        result = r.json()['results']['bindings']
    except Exception as e:
        print("Error during SPARQL call. We abort the process and have to investigate")
        # Only dump response details when a response exists; otherwise the
        # diagnostics themselves would raise NameError and hide the real error.
        if r is not None:
            print(r)
            print(r.content)
            print(r.url)
        raise Exception(e) from e
    return result


# In-memory RDF graph that collects every triple added by this script; it is
# serialized once at the end of the run.
g = Graph()

# Query templates. In both, the placeholder XXX is replaced by the entity
# URI (wrapped in angle brackets) before the query is used.
# Reminder: wdt:P625 are GEO coordinates.

# SELECT variant of the lookup, meant for pasting into the Wikidata query UI.
test_query = """
# Use with https://query.wikidata.org/
SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE {
    BIND (XXX as ?a) .
    OPTIONAL {
        ?a wdt:P625 ?coor.
    }
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    OPTIONAL {
        ?country wdt:P30 ?continent.
        ?continent rdfs:label ?continent_label
        FILTER (lang(?continent_label)='en')
    }
    FILTER (lang(?country_label)='en')
    FILTER (lang(?label)='en')
}
"""

# CONSTRUCT variant: yields the label, country, continent and (when present)
# coordinate triples for the entity, restricted to English labels.
query = """
construct {
    ?a wdt:P625 ?c.
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    ?country wdt:P30 ?continent .
    ?continent rdfs:label ?continent_label .
} WHERE
{
    BIND (XXX as ?a) .
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    ?country wdt:P30 ?continent.
    ?continent rdfs:label ?continent_label
    FILTER (lang(?continent_label)='en')
    FILTER (lang(?country_label)='en')
    FILTER (lang(?label)='en')
    OPTIONAL { ?a wdt:P625 ?c }
}
"""

# NOTE: despite its name this is the CSV file that is READ. The first column
# of every row is used as the URI of the entity to look up.
outputFile = 'input_location.csv'

with open(outputFile, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    counter = 0
    for counter, row in enumerate(spamreader, start=1):
        # csv.reader yields [] for blank lines; there is nothing to look up,
        # and row[0] would raise IndexError.
        if not row or not row[0].strip():
            continue

        try:
            entity = "<" + row[0] + ">"
            testq = test_query.replace("XXX", entity)
            print(testq)
            tmpquery = query.replace("XXX", entity)
            print(tmpquery)

            returnReply = callSPARQL(tmpquery)
            print(returnReply)

            # Each binding describes one triple via its 'subject',
            # 'predicate' and 'object' entries.
            for binding in returnReply:
                print(binding)
                sub = URIRef(binding['subject']['value'])
                pred = URIRef(binding['predicate']['value'])

                node = binding['object']
                if node['type'] == 'uri':
                    obj = URIRef(node['value'])
                elif node['type'] in ('literal', 'typed-literal'):
                    # 'typed-literal' is the spelling some endpoints use for
                    # literals that carry a datatype.
                    obj = Literal(node['value'])
                else:
                    # Without this branch `obj` would be unbound, or would
                    # silently reuse the object of the previous triple.
                    print("Skipping triple with unsupported object type",
                          node['type'], "in CSV row", counter)
                    continue
                g.add((sub, pred, obj))

        except Exception as e:
            print(e)
            raise

# Graph.serialize returns bytes on older rdflib releases and str on newer
# ones; decode only when bytes were returned.
serialized = g.serialize(format='n3')
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
print(serialized)
g.serialize(destination='enriched_output.txt', format='turtle')