# scripts/db_enrichment/country_enrichment.py
import requests
import csv
from rdflib import Graph, Literal, URIRef
import time
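
# This script enriches the location entities listed in input_location.csv with
# coordinates, country and continent information (plus English labels) pulled
# from Wikidata, and writes the collected triples out as Turtle.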


# Wikidata asks automated clients to identify themselves with a descriptive
# User-Agent that includes contact information.
sparqlHeaders = {'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0'}


def callSPARQL(query):
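    """Send a SPARQL query to the Wikidata endpoint and return the result
    bindings, backing off and retrying once if the server asks us to slow down."""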
    payload = {'query': query, 'format': 'json'}
    try:
        r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
        time.sleep(1)
        # Slow down in case we sent too many requests; if the server asks us to
        # back off, wait and retry once.
        if 'Retry-After' in r.headers:
            print(r.headers)
            time.sleep(45)
            r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)

        result = r.json()['results']['bindings']
    except Exception:
        print("Error during SPARQL call. We abort the process and have to investigate")
        print(r)
        print(r.content)
        print(r.url)
        raise
    return result


g = Graph()
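
# The CONSTRUCT query below is used as a template: XXX gets replaced with a
# Wikidata entity URI for each input row. It fetches the entity's coordinates
# (wdt:P625), its country (wdt:P17), that country's continent (wdt:P30), and
# the English labels of all three.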
query = """
construct {
    ?a wdt:P625 ?c. 
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.      
    ?country rdfs:label ?country_label . 
    ?country wdt:P30 ?continent. 
    ?continent rdfs:label ?continent_label   
} WHERE 
{ 
    BIND (XXX as ?a) . 
    ?a wdt:P625 ?c. 
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.      
    ?country rdfs:label ?country_label .    
    ?country wdt:P30 ?continent. 
    ?continent rdfs:label ?continent_label
    FILTER (lang(?continent_label)='en')           
    FILTER (lang(?country_label)='en')
    FILTER (lang(?label)='en') 

}
"""

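# Input CSV: each row is expected to carry a Wikidata entity URI in its first column.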
inputFile = 'input_location.csv'

with open(inputFile, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    counter = 0
    for row in reader:
        counter = counter + 1

        try:
            tmpquery = query.replace("XXX", "<" + row[0] + ">")
            print(tmpquery)

            returnReply = callSPARQL(tmpquery)
            print(returnReply)

            # Each binding describes one constructed triple (subject/predicate/object).
            for binding in returnReply:
                print(binding)
                sub = URIRef(binding['subject']['value'])
                pred = URIRef(binding['predicate']['value'])

                if binding['object']['type'] == 'uri':
                    obj = URIRef(binding['object']['value'])
                elif binding['object']['type'] == 'literal':
                    obj = Literal(binding['object']['value'])
                else:
                    # Skip blank nodes and any other unexpected term types.
                    continue
                g.add((sub, pred, obj))

        except Exception as e:
            print(e)
            raise

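# Dump the enriched graph to stdout and save it as Turtle. Note: older rdflib
# releases return bytes from serialize() (hence the decode()); rdflib 6+ returns
# a str, in which case the decode() call would need to be dropped.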
print(g.serialize(format='n3').decode("utf-8"))
g.serialize(destination='enriched_ouput.txt', format='turtle')