aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlltommy2020-09-27 12:56:27 +0200
committerlltommy2020-09-27 12:56:27 +0200
commit71820c89e06b7d028ffdf76ddf01141733c78388 (patch)
treef7b99454ca5912c4edcafbf7fdbe445b214337a1
parent7afd6b778b0deade1bf70062a10041e31a249af0 (diff)
downloadbh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.gz
bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.lz
bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.zip
Adding script supporting semantic enrichment
-rw-r--r--scripts/db_enrichment/country_enrichment.py91
-rw-r--r--scripts/db_enrichment/input_location.csv2
-rw-r--r--scripts/db_enrichment/readme.md20
3 files changed, 113 insertions, 0 deletions
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
new file mode 100644
index 0000000..8dcf5f2
--- /dev/null
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -0,0 +1,91 @@
+import requests
+import csv
+from rdflib import Graph, Literal, RDF, URIRef
+import time
+
+
+# HTTP headers sent with every request made by callSPARQL: the User-Agent names
+# this crawler and gives a contact address.
+sparqlHeaders={'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0'}
+
+
+def callSPARQL(query):
+    """Run *query* against the Wikidata SPARQL endpoint and return its bindings.
+
+    The request is sent as an HTTP GET asking for JSON output. After each
+    request the function sleeps for one second to keep the request rate low.
+    If the response carries a ``Retry-After`` header, the function waits and
+    sends the same request once more.
+
+    Parameters:
+        query: SPARQL query text.
+
+    Returns:
+        The list found under ``results`` -> ``bindings`` in the JSON response.
+
+    Raises:
+        Whatever exception the HTTP call or the JSON decoding raised; it is
+        re-raised unchanged after diagnostic output has been printed.
+    """
+    endpoint = 'https://query.wikidata.org/sparql'
+    payload = {'query': query, 'format': 'json'}
+    # Stays None when the very first request fails, so the error handler
+    # below does not trip over an unbound name and hide the real error.
+    r = None
+    try:
+        r = requests.get(endpoint, params=payload, headers=sparqlHeaders)
+        time.sleep(1)
+        # The server asks us to slow down via the "Retry-After" header
+        # (hyphen, not underscore). Sleep, then try again once.
+        if 'Retry-After' in r.headers:
+            print(r.headers)
+            retryAfter = r.headers['Retry-After']
+            # Use the delay given in seconds; fall back to 45 seconds when
+            # the header holds something else (e.g. an HTTP date).
+            time.sleep(int(retryAfter) if retryAfter.isdigit() else 45)
+            r = requests.get(endpoint, params=payload, headers=sparqlHeaders)
+
+        result = r.json()['results']['bindings']
+    except Exception:
+        print("Error during SPARQL call. We abort the process and have to investigate")
+        if r is not None:
+            print(r)
+            print(r.content)
+            print(r.url)
+        # Bare raise keeps the original exception type and traceback.
+        raise
+    return result
+
+
+g = Graph()
+
+
+# CONSTRUCT template. The XXX placeholder is replaced with one entity IRI per
+# input row. For that entity it collects wdt:P625 (coordinate location), its
+# label and its country (wdt:P17), plus the country's label and continent
+# (wdt:P30) and the continent's label. Only English labels are kept.
+query = """
+construct {
+ ?a wdt:P625 ?c.
+ ?a rdfs:label ?label .
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+} WHERE
+{
+ BIND (XXX as ?a) .
+ ?a wdt:P625 ?c.
+ ?a rdfs:label ?label .
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+ FILTER (lang(?continent_label)='en')
+ FILTER (lang(?country_label)='en')
+ FILTER (lang(?label)='en')
+
+}
+"""
+
+# CSV file with one Wikidata entity IRI in the first column of each row.
+inputFile = 'input_location.csv'
+
+with open(inputFile, 'r') as csvfile:
+    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
+    for row in spamreader:
+        # csv.reader yields an empty list for a blank line; there is nothing
+        # to look up for such a row (or for an empty first column).
+        if not row or not row[0].strip():
+            continue
+
+        try:
+            tmpquery = query.replace("XXX", "<" + row[0].strip() + ">")
+            print(tmpquery)
+
+            returnReply = callSPARQL(tmpquery)
+            print(returnReply)
+
+            # Each binding describes one triple of the CONSTRUCT result.
+            for binding in returnReply:
+                print(binding)
+                sub = URIRef(binding['subject']['value'])
+                pred = URIRef(binding['predicate']['value'])
+
+                objType = binding['object']['type']
+                if objType == 'uri':
+                    obj = URIRef(binding['object']['value'])
+                elif objType in ('literal', 'typed-literal'):
+                    # Some endpoints report typed literals as 'typed-literal'.
+                    obj = Literal(binding['object']['value'])
+                else:
+                    # Fail loudly instead of silently reusing the object of
+                    # the previous triple.
+                    raise ValueError("Unsupported object type in SPARQL result: " + objType)
+                g.add((sub, pred, obj))
+
+        except Exception as e:
+            print(e)
+            raise
+
+# Graph.serialize returns bytes in older rdflib releases and str in newer
+# ones; decode only when needed so the script works with both.
+serialized = g.serialize(format='n3')
+if isinstance(serialized, bytes):
+    serialized = serialized.decode("utf-8")
+print(serialized)
+g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
new file mode 100644
index 0000000..364afc8
--- /dev/null
+++ b/scripts/db_enrichment/input_location.csv
@@ -0,0 +1,2 @@
+http://www.wikidata.org/entity/Q111904
+http://www.wikidata.org/entity/Q1070 \ No newline at end of file
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
new file mode 100644
index 0000000..83297dc
--- /dev/null
+++ b/scripts/db_enrichment/readme.md
@@ -0,0 +1,20 @@
+We have two files in the folder *semantic_enrichment* that are used to enrich the identifiers in our triple store with additional information, e.g. human-readable labels and semantics (e.g. *which countries are grouped into a continent*). This document describes how to update these two files.
+
+### semantic_enrichment/labels.ttl
+Static labels for the ontology vocabulary terms we use. This file has to be updated manually. Use the OLS or BioPortal to find more information about an ontology term.
+
+### semantic_enrichment/countries.ttl
+File containing information about the countries in our database. Additional information about a country is, e.g., its label or GPS coordinates. We enrich the country identifiers via Wikidata.
+
+#### Update process
+- Which countries (= Wikidata identifiers) do we have to enrich?
+This query retrieves all countries (ids) from our database that do not have a label yet:
+
+>SELECT DISTINCT ?geoLocation WHERE
+>{
+>?fasta ?x [<http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
+>FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
+>}
+
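+- Use the list of identifiers created with the query above as input for the update script *country_enrichment.py* (the script reads them from *input_location.csv*, one identifier per line). The script creates a temporary Turtle file, *enriched_ouput.txt*, in this folder.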
+- Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve the script output so that manual intervention is no longer needed. Currently there are "double entries" for continents in the output.)