Adding script supporting semantic enrichment

author: lltommy 2020-09-27 12:56:27 +0200
committer: lltommy 2020-09-27 12:56:27 +0200
commit: 71820c89e06b7d028ffdf76ddf01141733c78388 (patch)
tree: f7b99454ca5912c4edcafbf7fdbe445b214337a1
parent: 7afd6b778b0deade1bf70062a10041e31a249af0 (diff)
download: bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.gz
bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.lz
bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.zip
3 files changed, 113 insertions, 0 deletions
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
new file mode 100644
index 0000000..8dcf5f2
--- /dev/null
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -0,0 +1,91 @@
+import requests
+import csv
+from rdflib import Graph, Literal, RDF, URIRef
+import time
+
+
+sparqlHeaders={'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0'}
+
+
+def callSPARQL(query):
+    """Run `query` on the Wikidata SPARQL endpoint; return the JSON 'bindings' list. Failures are re-raised."""
+    payload = {'query': query, 'format': 'json'}
+    r = None  # stays None when the request itself fails, so the error handler below cannot hit an unbound name
+    try:
+        r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
+        time.sleep(1)  # throttle consecutive calls
+        # The server asked us to back off (the HTTP header is 'Retry-After'): sleep, then retry once
+        if 'Retry-After' in r.headers:
+            print(r.headers)
+            time.sleep(45)
+            r = requests.get('https://query.wikidata.org/sparql', params=payload, headers=sparqlHeaders)
+
+        return r.json()['results']['bindings']
+    except Exception:
+        print("Error during SPARQL call. We abort the process and have to investigate")
+        if r is not None:
+            print(r, r.content, r.url, sep='\n')
+        raise  # keep the original exception type and traceback
+
+
+g = Graph()  # accumulates the triples returned for every input entity
+
+# SPARQL CONSTRUCT template; the XXX placeholder is replaced with an entity URI per input row
+
+query = """
+construct {
+    ?a wdt:P625 ?c. 
+    ?a rdfs:label ?label .
+    ?a wdt:P17 ?country.      
+    ?country rdfs:label ?country_label . 
+    ?country wdt:P30 ?continent. 
+    ?continent rdfs:label ?continent_label   
+} WHERE 
+{ 
+    BIND (XXX as ?a) . 
+    ?a wdt:P625 ?c. 
+    ?a rdfs:label ?label .
+    ?a wdt:P17 ?country.      
+    ?country rdfs:label ?country_label .    
+    ?country wdt:P30 ?continent. 
+    ?continent rdfs:label ?continent_label
+    FILTER (lang(?continent_label)='en')           
+    FILTER (lang(?country_label)='en')
+    FILTER (lang(?label)='en') 
+
+}  
+"""""
+
+outputFile = 'input_location.csv'  # NOTE: despite the name this is the INPUT (one entity URI per row)
+
+with open(outputFile, 'r', newline='') as csvfile:
+    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
+    for row in spamreader:
+        if not row or not row[0].strip():
+            continue  # tolerate blank lines in the input instead of failing on row[0]
+        try:
+            # Substitute the entity URI for the XXX placeholder in the query template
+            tmpquery=query.replace("XXX", "<"+row[0].strip()+">")
+            print(tmpquery)
+            returnReply=callSPARQL(tmpquery)
+            print(returnReply)
+            # One binding per constructed triple; named 'triple' so the CSV 'row' is not shadowed
+            for triple in returnReply:
+                print(triple)
+                sub=URIRef(triple['subject']['value'])
+                pred=URIRef(triple['predicate']['value'])
+                if triple['object']['type'] == 'uri':
+                    obj = URIRef(triple['object']['value'])
+                elif triple['object']['type'] == 'literal':
+                    obj = Literal(triple['object']['value'])
+                else:
+                    # Abort rather than silently re-adding the previous triple's object
+                    raise ValueError("Unsupported object type: " + triple['object']['type'])
+                g.add(( sub, pred, obj ))
+        except Exception as e:
+            print(e)
+            raise
+
+n3 = g.serialize(format='n3')  # bytes on rdflib < 6, str on rdflib >= 6
+print(n3.decode("utf-8") if isinstance(n3, bytes) else n3)
+g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
new file mode 100644
index 0000000..364afc8
--- /dev/null
+++ b/scripts/db_enrichment/input_location.csv
@@ -0,0 +1,2 @@
+http://www.wikidata.org/entity/Q111904
+http://www.wikidata.org/entity/Q1070
\ No newline at end of file
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
new file mode 100644
index 0000000..83297dc
--- /dev/null
+++ b/scripts/db_enrichment/readme.md
@@ -0,0 +1,20 @@
+We have two files in the folder *semantic_enrichment* that are used to enrich the identifiers in our triple store with additional information, e.g. human-readable labels and semantics (e.g. *Which countries are summarized as a continent*). This document describes how to update these two files.
+
+### semantic_enrichment/labels.ttl
+Static labels for the ontology vocabulary terms we use. This file has to be updated manually. Use the OLS or BioPortal to find more information about an ontology term in use.
+
+### semantic_enrichment/countries.ttl
+File containing information about the countries in our database. Additional information about a country is, e.g., its label or GPS coordinates. We enrich the country identifiers via Wikidata.
+
+#### Update process
+- What countries (=wikidata identifier) do we have to enrich?
+This query retrieves all countries (ids) from our database that do not have a label yet:
+
+>SELECT DISTINCT ?geoLocation  WHERE
+>{
+>?fasta ?x [<http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
+>FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
+>}
+
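+- Use the list of identifiers created with the query above as input for the update script *country_enrichment.py* (the script reads them from *input_location.csv*, one identifier per line). The script writes a temporary Turtle file, *enriched_ouput.txt*, to the folder it is run from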
+- Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve the script output so that manual intervention is no longer needed. Currently there are "double entries" for continents in the output)
author	lltommy	2020-09-27 12:56:27 +0200
committer	lltommy	2020-09-27 12:56:27 +0200
commit	71820c89e06b7d028ffdf76ddf01141733c78388 (patch)
tree	f7b99454ca5912c4edcafbf7fdbe445b214337a1
parent	7afd6b778b0deade1bf70062a10041e31a249af0 (diff)
download	bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.gz bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.tar.lz bh20-seq-resource-71820c89e06b7d028ffdf76ddf01141733c78388.zip