aboutsummaryrefslogtreecommitdiff
path: root/scripts/db_enrichment/country_enrichment.py
diff options
context:
space:
mode:
authorlltommy2020-09-27 12:56:27 +0200
committerlltommy2020-09-27 12:56:27 +0200
commit71820c89e06b7d028ffdf76ddf01141733c78388 (patch)
treef7b99454ca5912c4edcafbf7fdbe445b214337a1 /scripts/db_enrichment/country_enrichment.py
parent7afd6b778b0deade1bf70062a10041e31a249af0 (diff)
downloadbh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.gz
bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.lz
bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.zip
Adding script supporting semantic enrichment
Diffstat (limited to 'scripts/db_enrichment/country_enrichment.py')
-rw-r--r--scripts/db_enrichment/country_enrichment.py91
1 files changed, 91 insertions, 0 deletions
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
new file mode 100644
index 0000000..8dcf5f2
--- /dev/null
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -0,0 +1,91 @@
+import requests
+import csv
+from rdflib import Graph, Literal, RDF, URIRef
+import time
+
+
+sparqlHeaders={'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0'}
+
+
+def callSPARQL(query):
+ payload = {'query': query, 'format': 'json'}
+ try:
+ r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
+ time.sleep(1)
+ # Slow process down, in case we did send too many processes. Sleep, then try again
+ if 'Retry_After' in r.headers:
+ print(r.headers)
+ time.sleep(45)
+ r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
+
+ result = r.json()['results']['bindings']
+ except Exception as e:
+ print("Error during SPARQL call. We abort the process and have to investigate")
+ print(r)
+ print(r.content)
+ print(r.url)
+ raise Exception(e)
+ return result
+
+
+g = Graph()
+
+
+
+query = """
+construct {
+ ?a wdt:P625 ?c.
+ ?a rdfs:label ?label .
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+} WHERE
+{
+ BIND (XXX as ?a) .
+ ?a wdt:P625 ?c.
+ ?a rdfs:label ?label .
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+ FILTER (lang(?continent_label)='en')
+ FILTER (lang(?country_label)='en')
+ FILTER (lang(?label)='en')
+
+}
+"""""
+
+outputFile = 'input_location.csv'
+
+with open(outputFile, 'r') as csvfile:
+ spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
+ counter=0
+ for row in spamreader:
+ counter=counter+1
+
+ try:
+ tmpquery=query.replace("XXX", "<"+row[0]+">")
+ print(tmpquery)
+
+ returnReply=callSPARQL(tmpquery)
+ print(returnReply)
+
+ for row in returnReply:
+ print(row)
+ sub=URIRef(row['subject']['value'])
+ pred=URIRef(row['predicate']['value'])
+
+ if row['object']['type'] == 'uri':
+ obj = URIRef(row['object']['value'])
+
+ elif row['object']['type'] == 'literal':
+ obj= Literal(row['object']['value'])
+ g.add(( sub, pred, obj ))
+
+ except Exception as e:
+ print(e)
+ raise
+
+print(g.serialize(format='n3').decode("utf-8"))
+g.serialize(destination='enriched_ouput.txt', format='turtle') \ No newline at end of file