aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Pjotr Prins, 2020-11-06 10:13:05 +0000
committer: Pjotr Prins, 2020-11-06 10:13:05 +0000
commit: d75f1c74fbf86652b02520de6ed46c981cf27e50 (patch)
tree: 6c2744b7830453c20292b83d9aa0e9245abee4c2
parent: 43d7264dda8061a024befbc9ca0a89d7159b1e40 (diff)
download: bh20-seq-resource-d75f1c74fbf86652b02520de6ed46c981cf27e50.tar.gz
bh20-seq-resource-d75f1c74fbf86652b02520de6ed46c981cf27e50.tar.lz
bh20-seq-resource-d75f1c74fbf86652b02520de6ed46c981cf27e50.zip
Adding Tennessee items
-rw-r--r-- doc/INSTALL.md 5
-rw-r--r-- scripts/db_enrichment/.gitignore 1
-rw-r--r-- scripts/db_enrichment/country_enrichment.py 43
-rw-r--r-- scripts/db_enrichment/input_location.csv 7
-rw-r--r-- scripts/db_enrichment/readme.md 2
-rw-r--r-- semantic_enrichment/countries.ttl 18
6 files changed, 56 insertions, 20 deletions
diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 96cf1d4..f54c8f2 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -68,6 +68,11 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/i
Note: see above on GUIX_PACKAGE_PATH.
+## Run country semantic enrichment script
+
+ cd bh20-seq-resource/scripts/db_enrichment
+ guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py
+
## Run the tests
guix package -i python-requests python-pandas python-jinja2 python -p ~/opt/python-dev
diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore
new file mode 100644
index 0000000..30b159b
--- /dev/null
+++ b/scripts/db_enrichment/.gitignore
@@ -0,0 +1 @@
+enriched_output.txt
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 8dcf5f2..1f99d42 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -1,3 +1,12 @@
+# This script by @LLTommy queries the main SPARQL endpoint to find what
+# collections are missing country information for GPS coordinates, such as:
+#
+# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ;
+# ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+# ns1:P625 "Point(172.0 -43.6)" .
+#
+# See also the ./readme.md
+
import requests
import csv
from rdflib import Graph, Literal, RDF, URIRef
@@ -30,30 +39,28 @@ def callSPARQL(query):
g = Graph()
-
-
query = """
construct {
- ?a wdt:P625 ?c.
+ ?a wdt:P625 ?c.
?a rdfs:label ?label .
- ?a wdt:P17 ?country.
- ?country rdfs:label ?country_label .
- ?country wdt:P30 ?continent.
- ?continent rdfs:label ?continent_label
-} WHERE
-{
- BIND (XXX as ?a) .
- ?a wdt:P625 ?c.
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+} WHERE
+{
+ BIND (XXX as ?a) .
+ ?a wdt:P625 ?c.
?a rdfs:label ?label .
- ?a wdt:P17 ?country.
- ?country rdfs:label ?country_label .
- ?country wdt:P30 ?continent.
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
?continent rdfs:label ?continent_label
- FILTER (lang(?continent_label)='en')
+ FILTER (lang(?continent_label)='en')
FILTER (lang(?country_label)='en')
- FILTER (lang(?label)='en')
+ FILTER (lang(?label)='en')
-}
+}
"""""
outputFile = 'input_location.csv'
@@ -88,4 +95,4 @@ with open(outputFile, 'r') as csvfile:
raise
print(g.serialize(format='n3').decode("utf-8"))
-g.serialize(destination='enriched_ouput.txt', format='turtle') \ No newline at end of file
+g.serialize(destination='enriched_output.txt', format='turtle')
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index 364afc8..eb5322a 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,2 +1,5 @@
-http://www.wikidata.org/entity/Q111904
-http://www.wikidata.org/entity/Q1070 \ No newline at end of file
+http://www.wikidata.org/entity/Q3289517
+http://www.wikidata.org/entity/Q79663
+http://www.wikidata.org/entity/Q2145339
+http://www.wikidata.org/entity/Q23197
+http://www.wikidata.org/entity/Q494755
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 55ec496..88e8be5 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -17,5 +17,7 @@ This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countrie
>FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label}
>}
+[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
+
- Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder.
- Merge the output of the script above manually into the file semantic_enrichment/countries.ttl. (TODO: Improve the script output so that manual intervention is no longer needed. Currently there are "double entries" for continents in the output.)
diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl
index 08e9c38..fe50b16 100644
--- a/semantic_enrichment/countries.ttl
+++ b/semantic_enrichment/countries.ttl
@@ -1328,7 +1328,25 @@
ns1:P17 <http://www.wikidata.org/entity/Q79> ;
ns1:P625 "Point(31.239444444 30.056111111)" .
+<http://www.wikidata.org/entity/Q2145339> rdfs:label "Smithville" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-85.820833333 35.957222222)" .
+
+<http://www.wikidata.org/entity/Q23197> rdfs:label "Nashville" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-86.783888888 36.165)" .
+
+<http://www.wikidata.org/entity/Q3289517> rdfs:label "Pegram" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-87.051666666 36.101666666)" .
+
+<http://www.wikidata.org/entity/Q494755> rdfs:label "Madison County" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-88.84 35.61)" .
+<http://www.wikidata.org/entity/Q79663> rdfs:label "Alexander City" ;
+ ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+ ns1:P625 "Point(-85.936008 32.933157)" .
<http://www.wikidata.org/entity/Q538> rdfs:label "Oceania" .
<http://www.wikidata.org/entity/Q49> rdfs:label "North America" .