aboutsummaryrefslogtreecommitdiff
path: root/scripts/db_enrichment/country_enrichment.py
diff options
context:
space:
mode:
author lltommy 2020-11-11 09:56:12 +0100
committer lltommy 2020-11-11 09:56:12 +0100
commit d6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch)
tree 6e8b77bde4dc34fab3fa8804906f3cb821f61dae /scripts/db_enrichment/country_enrichment.py
parent c5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff)
parent c872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff)
download bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.gz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.lz
bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.zip
Merge branch 'master' of https://github.com/arvados/bh20-seq-resource
Diffstat (limited to 'scripts/db_enrichment/country_enrichment.py')
-rw-r--r-- scripts/db_enrichment/country_enrichment.py 64
1 files changed, 47 insertions, 17 deletions
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 8dcf5f2..37329fb 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -1,3 +1,12 @@
+# This script by @LLTommy queries the main SPARQL endpoint to find what
+# collections are missing country information for GPS coordinates, such as:
+#
+# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ;
+# ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+# ns1:P625 "Point(172.0 -43.6)" .
+#
+# See also the ./readme.md
+
import requests
import csv
from rdflib import Graph, Literal, RDF, URIRef
@@ -30,30 +39,49 @@ def callSPARQL(query):
g = Graph()
+test_query="""
+# Use with https://query.wikidata.org/
+SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE {
+ BIND (XXX as ?a) .
+ OPTIONAL {
+ ?a wdt:P625 ?coor.
+ }
+ ?a rdfs:label ?label .
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ OPTIONAL {
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+ FILTER (lang(?continent_label)='en')
+ }
+ FILTER (lang(?country_label)='en')
+ FILTER (lang(?label)='en')
+}
+"""
+# wdt:P625 are GEO coordinates
query = """
construct {
- ?a wdt:P625 ?c.
+ ?a wdt:P625 ?c.
?a rdfs:label ?label .
- ?a wdt:P17 ?country.
- ?country rdfs:label ?country_label .
- ?country wdt:P30 ?continent.
- ?continent rdfs:label ?continent_label
-} WHERE
-{
- BIND (XXX as ?a) .
- ?a wdt:P625 ?c.
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent .
+ ?continent rdfs:label ?continent_label .
+} WHERE
+{
+ BIND (XXX as ?a) .
?a rdfs:label ?label .
- ?a wdt:P17 ?country.
- ?country rdfs:label ?country_label .
- ?country wdt:P30 ?continent.
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
?continent rdfs:label ?continent_label
- FILTER (lang(?continent_label)='en')
+ FILTER (lang(?continent_label)='en')
FILTER (lang(?country_label)='en')
- FILTER (lang(?label)='en')
-
-}
+ FILTER (lang(?label)='en')
+ OPTIONAL { ?a wdt:P625 ?c }
+}
"""""
outputFile = 'input_location.csv'
@@ -65,6 +93,8 @@ with open(outputFile, 'r') as csvfile:
counter=counter+1
try:
+ testq = test_query.replace("XXX", "<"+row[0]+">")
+ print(testq)
tmpquery=query.replace("XXX", "<"+row[0]+">")
print(tmpquery)
@@ -88,4 +118,4 @@ with open(outputFile, 'r') as csvfile:
raise
print(g.serialize(format='n3').decode("utf-8"))
-g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
+g.serialize(destination='enriched_output.txt', format='turtle')