diff options
author | lltommy | 2020-09-27 12:56:27 +0200 |
---|---|---|
committer | lltommy | 2020-09-27 12:56:27 +0200 |
commit | 71820c89e06b7d028ffdf76ddf01141733c78388 (patch) | |
tree | f7b99454ca5912c4edcafbf7fdbe445b214337a1 /scripts/db_enrichment/country_enrichment.py | |
parent | 7afd6b778b0deade1bf70062a10041e31a249af0 (diff) | |
download | bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.gz bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.lz bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.zip |
Adding script supporting semantic enrichment
Diffstat (limited to 'scripts/db_enrichment/country_enrichment.py')
-rw-r--r-- | scripts/db_enrichment/country_enrichment.py | 91 |
1 files changed, 91 insertions, 0 deletions
import requests
import csv
from rdflib import Graph, Literal, RDF, URIRef
import time


# Custom User-Agent per the Wikidata Query Service policy, so operators can
# identify and contact us if the crawler misbehaves.
sparqlHeaders = {'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0'}


def callSPARQL(query):
    """Run *query* against the Wikidata SPARQL endpoint and return the
    list of result bindings (``results.bindings`` from the JSON reply).

    Sleeps one second after every request to slow the crawl down. If the
    server asks us to back off (``Retry-After`` response header), waits 45
    seconds and retries once. On any failure, prints diagnostics and
    re-raises so the caller can abort.
    """
    payload = {'query': query, 'format': 'json'}
    try:
        r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
        time.sleep(1)
        # Slow process down, in case we did send too many processes. Sleep, then try again.
        # BUG FIX: the HTTP header is spelled 'Retry-After', not 'Retry_After';
        # with the old spelling this back-off branch could never trigger.
        if 'Retry-After' in r.headers:
            print(r.headers)
            time.sleep(45)
            r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)

        result = r.json()['results']['bindings']
    except Exception:
        # NOTE(review): if requests.get itself raised, 'r' is unbound and the
        # prints below fail too — acceptable here since we abort either way.
        print("Error during SPARQL call. We abort the process and have to investigate")
        print(r)
        print(r.content)
        print(r.url)
        # Re-raise the original exception; wrapping it in Exception(e) (as
        # before) discarded the traceback.
        raise
    return result


# Accumulates all enriched triples across every input row.
g = Graph()


# CONSTRUCT query template: XXX is replaced per input row with a concrete
# Wikidata entity URI. Pulls coordinates (wdt:P625), labels, the containing
# country (wdt:P17) and its continent (wdt:P30), English labels only.
query = """
construct {
    ?a wdt:P625 ?c.
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    ?country wdt:P30 ?continent.
    ?continent rdfs:label ?continent_label
} WHERE
{
    BIND (XXX as ?a) .
    ?a wdt:P625 ?c.
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    ?country wdt:P30 ?continent.
    ?continent rdfs:label ?continent_label
    FILTER (lang(?continent_label)='en')
    FILTER (lang(?country_label)='en')
    FILTER (lang(?label)='en')

}
"""

# Input: one Wikidata entity URI per CSV row, first column.
outputFile = 'input_location.csv'

with open(outputFile, 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    counter = 0
    for row in spamreader:
        counter = counter + 1

        try:
            tmpquery = query.replace("XXX", "<" + row[0] + ">")
            print(tmpquery)

            returnReply = callSPARQL(tmpquery)
            print(returnReply)

            # BUG FIX: loop variable renamed from 'row' so the outer CSV row
            # is no longer clobbered by the reply bindings.
            for binding in returnReply:
                print(binding)
                sub = URIRef(binding['subject']['value'])
                pred = URIRef(binding['predicate']['value'])

                if binding['object']['type'] == 'uri':
                    obj = URIRef(binding['object']['value'])
                elif binding['object']['type'] == 'literal':
                    obj = Literal(binding['object']['value'])
                else:
                    # BUG FIX: for any other node type (e.g. bnode), 'obj'
                    # used to be unbound or stale from a previous iteration
                    # yet was still added to the graph; skip it instead.
                    continue
                g.add((sub, pred, obj))

        except Exception as e:
            print(e)
            raise

# NOTE(review): 'enriched_ouput.txt' is misspelled, but kept byte-identical
# because downstream steps may already depend on this filename.
print(g.serialize(format='n3').decode("utf-8"))
g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file |