Merge branch 'master' of https://github.com/arvados/bh20-seq-resource

author: lltommy 2020-11-11 09:56:12 +0100
committer: lltommy 2020-11-11 09:56:12 +0100
commit: d6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch)
tree: 6e8b77bde4dc34fab3fa8804906f3cb821f61dae /scripts/db_enrichment
parent: c5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff)
parent: c872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff)
download: bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.gz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.lz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.zip
4 files changed, 63 insertions, 24 deletions
diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore
new file mode 100644
index 0000000..30b159b
--- /dev/null
+++ b/scripts/db_enrichment/.gitignore
@@ -0,0 +1 @@
+enriched_output.txt
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 8dcf5f2..37329fb 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -1,3 +1,12 @@
+# This script by @LLTommy queries the main SPARQL end point to find what
+# collections are missing country information for GPS coordinates, such
+#
+# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ;
+#    ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+#    ns1:P625 "Point(172.0 -43.6)" .
+#
+# See also the ./readme.md
+
 import requests
 import csv
 from rdflib import Graph, Literal, RDF, URIRef
@@ -30,30 +39,49 @@ def callSPARQL(query):
 
 g = Graph()
 
+test_query="""
+# Use with https://query.wikidata.org/
+SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE {
+    BIND (XXX as ?a) .
+    OPTIONAL {
+        ?a wdt:P625 ?coor.
+    }
+    ?a rdfs:label ?label .
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    OPTIONAL {
+        ?country wdt:P30 ?continent.
+        ?continent rdfs:label ?continent_label
+        FILTER (lang(?continent_label)='en')
+    }
+    FILTER (lang(?country_label)='en')
+    FILTER (lang(?label)='en')
+}
+"""
 
+# wdt:P625 are GEO coordinates
 
 query = """
 construct {
-    ?a wdt:P625 ?c. 
+    ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
-    ?a wdt:P17 ?country.      
-    ?country rdfs:label ?country_label . 
-    ?country wdt:P30 ?continent. 
-    ?continent rdfs:label ?continent_label   
-} WHERE 
-{ 
-    BIND (XXX as ?a) . 
-    ?a wdt:P625 ?c. 
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    ?country wdt:P30 ?continent .
+    ?continent rdfs:label ?continent_label .
+} WHERE
+{
+    BIND (XXX as ?a) .
     ?a rdfs:label ?label .
-    ?a wdt:P17 ?country.      
-    ?country rdfs:label ?country_label .    
-    ?country wdt:P30 ?continent. 
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    ?country wdt:P30 ?continent.
     ?continent rdfs:label ?continent_label
-    FILTER (lang(?continent_label)='en')           
+    FILTER (lang(?continent_label)='en')
     FILTER (lang(?country_label)='en')
-    FILTER (lang(?label)='en') 
-
-}  
+    FILTER (lang(?label)='en')
+    OPTIONAL { ?a wdt:P625 ?c }
+}
 """""
 
 outputFile = 'input_location.csv'
@@ -65,6 +93,8 @@ with open(outputFile, 'r') as csvfile:
         counter=counter+1
 
         try:
+            testq = test_query.replace("XXX", "<"+row[0]+">")
+            print(testq)
             tmpquery=query.replace("XXX", "<"+row[0]+">")
             print(tmpquery)
 
@@ -88,4 +118,4 @@ with open(outputFile, 'r') as csvfile:
             raise
 
 print(g.serialize(format='n3').decode("utf-8"))
-g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
+g.serialize(destination='enriched_output.txt', format='turtle')
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index 364afc8..8c3308f 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,2 +1,6 @@
-http://www.wikidata.org/entity/Q111904
-http://www.wikidata.org/entity/Q1070
\ No newline at end of file
+http://www.wikidata.org/entity/Q7960498
+http://www.wikidata.org/entity/Q692895
+http://www.wikidata.org/entity/Q2722074
+http://www.wikidata.org/entity/Q25622187
+http://www.wikidata.org/entity/Q27684996
+http://www.wikidata.org/entity/Q2757125
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 55ec496..7539104 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -11,11 +11,15 @@ File containing information about the countries in our database. Additional info
 This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet:
 
 
->SELECT DISTINCT ?geoLocation  WHERE
->{
->?fasta ?x [ <<http://purl.obolibrary.org/obo/GAZ_00000448>> ?geoLocation] .
->FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label}
->}
+```sparql
+SELECT DISTINCT ?geoLocation  WHERE
+{
+    ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
+    FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
+}
+```
+
+[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
 
 - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder
 - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output)
author	lltommy	2020-11-11 09:56:12 +0100
committer	lltommy	2020-11-11 09:56:12 +0100
commit	d6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch)
tree	6e8b77bde4dc34fab3fa8804906f3cb821f61dae /scripts/db_enrichment
parent	c5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff)
parent	c872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff)
download	bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.gz bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.lz bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.zip