"""Enrich Wikidata location identifiers with labels, country and continent.

The script reads Wikidata entity URIs (first column, one per row) from
``input_location.csv``, runs a SPARQL CONSTRUCT query against the Wikidata
query service for each of them and collects the returned triples
(coordinates, English labels, country and continent) in an rdflib graph.
The graph is printed as N3 and written to ``enriched_ouput.txt`` as Turtle.

Example input rows::

    http://www.wikidata.org/entity/Q111904
    http://www.wikidata.org/entity/Q1070
"""
import requests
import csv
from rdflib import Graph, Literal, RDF, URIRef
import time


sparqlHeaders={'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0'}

SPARQL_ENDPOINT = 'https://query.wikidata.org/sparql'

# Seconds to wait for the endpoint before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 120


def callSPARQL(query):
    """Run *query* against the Wikidata SPARQL endpoint.

    Args:
        query: SPARQL query text.

    Returns:
        The ``results.bindings`` list of the JSON response.

    Raises:
        Exception: the original error raised by ``requests`` or by decoding
            the response is re-raised after the request details (when a
            response exists) have been printed.
    """
    payload = {'query': query, 'format': 'json'}
    # Bound up front so the error handler can tell "no response at all"
    # apart from "bad response".
    r = None
    try:
        r = requests.get(SPARQL_ENDPOINT, params=payload,
                         headers=sparqlHeaders, timeout=REQUEST_TIMEOUT)
        time.sleep(1)
        # Slow the process down in case we sent too many requests: sleep,
        # then try once more. The HTTP header is spelled "Retry-After".
        if 'Retry-After' in r.headers:
            print(r.headers)
            time.sleep(45)
            r = requests.get(SPARQL_ENDPOINT, params=payload,
                             headers=sparqlHeaders, timeout=REQUEST_TIMEOUT)

        result = r.json()['results']['bindings']
    except Exception:
        print("Error during SPARQL call. We abort the process and have to investigate")
        if r is not None:
            print(r)
            print(r.content)
            print(r.url)
        # Bare raise keeps the original exception type and traceback.
        raise
    return result


def _to_rdf_term(term):
    """Convert one SPARQL-JSON term (``{'type': ..., 'value': ...}``) to an rdflib node."""
    kind = term['type']
    if kind == 'uri':
        return URIRef(term['value'])
    if kind == 'literal':
        return Literal(term['value'])
    # Fail loudly instead of reusing the object of a previous triple.
    raise ValueError("Unsupported RDF term type in SPARQL result: " + repr(kind))


g = Graph()


# "XXX" is replaced by the <entity URI> of each input row.
query = """
construct {
    ?a wdt:P625 ?c.
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    ?country wdt:P30 ?continent.
    ?continent rdfs:label ?continent_label
} WHERE
{
    BIND (XXX as ?a) .
    ?a wdt:P625 ?c.
    ?a rdfs:label ?label .
    ?a wdt:P17 ?country.
    ?country rdfs:label ?country_label .
    ?country wdt:P30 ?continent.
    ?continent rdfs:label ?continent_label
    FILTER (lang(?continent_label)='en')
    FILTER (lang(?country_label)='en')
    FILTER (lang(?label)='en')

}
"""

# NOTE: despite its name this is the path of the *input* CSV file.
outputFile = 'input_location.csv'

with open(outputFile, 'r', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not row or not row[0].strip():
            continue

        try:
            tmpquery = query.replace("XXX", "<" + row[0].strip() + ">")
            print(tmpquery)

            returnReply = callSPARQL(tmpquery)
            print(returnReply)

            # The CONSTRUCT result arrives as subject/predicate/object bindings.
            for triple in returnReply:
                print(triple)
                sub = URIRef(triple['subject']['value'])
                pred = URIRef(triple['predicate']['value'])
                obj = _to_rdf_term(triple['object'])
                g.add((sub, pred, obj))

        except Exception as e:
            print(e)
            raise

# rdflib < 6 returns bytes here, rdflib >= 6 returns str.
serialized = g.serialize(format='n3')
if isinstance(serialized, bytes):
    serialized = serialized.decode("utf-8")
print(serialized)
g.serialize(destination='enriched_ouput.txt', format='turtle')
The labels.ttl file has to be updated manually. Use the OLS or BioPortal to look up more information about an ontology term we use.
+
+### semantic_enrichment/countries.ttl
+File containing information about the countries in our database. Additional information about a country is, e.g., its label or GPS coordinates. We enrich the country identifiers via Wikidata.
+
+#### Update process
+- Which countries (= Wikidata identifiers) do we have to enrich?
+This query retrieves all countries (ids) from our database that do not have a label yet:
+
+>SELECT DISTINCT ?geoLocation WHERE
+>{
+>?fasta ?x [ ?geoLocation] .
+>FILTER NOT EXISTS {?geoLocation ?geoLocation_tmp_label}
+>}
+
+- Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary Turtle file (*enriched_ouput.txt*) in this folder.
+- Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve the script output so that manual intervention is no longer needed. Currently there are "double entries" for continents in the output.)
-- cgit v1.2.3