diff options
-rw-r--r-- | doc/INSTALL.md | 5 | ||||
-rw-r--r-- | scripts/db_enrichment/.gitignore | 1 | ||||
-rw-r--r-- | scripts/db_enrichment/country_enrichment.py | 43 | ||||
-rw-r--r-- | scripts/db_enrichment/input_location.csv | 7 | ||||
-rw-r--r-- | scripts/db_enrichment/readme.md | 2 | ||||
-rw-r--r-- | semantic_enrichment/countries.ttl | 18 |
6 files changed, 56 insertions, 20 deletions
diff --git a/doc/INSTALL.md b/doc/INSTALL.md index 96cf1d4..f54c8f2 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -68,6 +68,11 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/i Note: see above on GUIX_PACKAGE_PATH. +## Run country semantic enrichment script + + cd bh20-seq-resource/scripts/db_enrichment + guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py + ## Run the tests guix package -i python-requests python-pandas python-jinja2 python -p ~/opt/python-dev diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore new file mode 100644 index 0000000..30b159b --- /dev/null +++ b/scripts/db_enrichment/.gitignore @@ -0,0 +1 @@ +enriched_output.txt diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py index 8dcf5f2..1f99d42 100644 --- a/scripts/db_enrichment/country_enrichment.py +++ b/scripts/db_enrichment/country_enrichment.py @@ -1,3 +1,12 @@ +# This script by @LLTommy queries the main SPARQL end point to find what +# collections are missing country information for GPS coordinates, such as: +# +# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ; +# ns1:P17 <http://www.wikidata.org/entity/Q664> ; +# ns1:P625 "Point(172.0 -43.6)" . +# +# See also the ./readme.md + import requests import csv from rdflib import Graph, Literal, RDF, URIRef @@ -30,30 +39,28 @@ def callSPARQL(query): g = Graph() - - query = """ construct { - ?a wdt:P625 ?c. + ?a wdt:P625 ?c. ?a rdfs:label ?label . - ?a wdt:P17 ?country. - ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. - ?continent rdfs:label ?continent_label -} WHERE -{ - BIND (XXX as ?a) . - ?a wdt:P625 ?c. + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + ?country wdt:P30 ?continent. + ?continent rdfs:label ?continent_label +} WHERE +{ + BIND (XXX as ?a) . + ?a wdt:P625 ?c. ?a rdfs:label ?label . - ?a wdt:P17 ?country. 
- ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + ?country wdt:P30 ?continent. ?continent rdfs:label ?continent_label - FILTER (lang(?continent_label)='en') + FILTER (lang(?continent_label)='en') FILTER (lang(?country_label)='en') - FILTER (lang(?label)='en') + FILTER (lang(?label)='en') -} +} """"" outputFile = 'input_location.csv' @@ -88,4 +95,4 @@ with open(outputFile, 'r') as csvfile: raise print(g.serialize(format='n3').decode("utf-8")) -g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file +g.serialize(destination='enriched_output.txt', format='turtle') diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index 364afc8..eb5322a 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,2 +1,5 @@ -http://www.wikidata.org/entity/Q111904 -http://www.wikidata.org/entity/Q1070
\ No newline at end of file +http://www.wikidata.org/entity/Q3289517 +http://www.wikidata.org/entity/Q79663 +http://www.wikidata.org/entity/Q2145339 +http://www.wikidata.org/entity/Q23197 +http://www.wikidata.org/entity/Q494755 diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md index 55ec496..88e8be5 100644 --- a/scripts/db_enrichment/readme.md +++ b/scripts/db_enrichment/readme.md @@ -17,5 +17,7 @@ This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countrie >FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label} >} +[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+) + - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention is no longer needed. Currently there are "double entries" for continents in the output) diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl index 08e9c38..fe50b16 100644 --- a/semantic_enrichment/countries.ttl +++ b/semantic_enrichment/countries.ttl @@ -1328,7 +1328,25 @@ ns1:P17 <http://www.wikidata.org/entity/Q79> ; ns1:P625 "Point(31.239444444 30.056111111)" . +<http://www.wikidata.org/entity/Q2145339> rdfs:label "Smithville" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-85.820833333 35.957222222)" . 
+ +<http://www.wikidata.org/entity/Q23197> rdfs:label "Nashville" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-86.783888888 36.165)" . + +<http://www.wikidata.org/entity/Q3289517> rdfs:label "Pegram" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-87.051666666 36.101666666)" . + +<http://www.wikidata.org/entity/Q494755> rdfs:label "Madison County" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-88.84 35.61)" . +<http://www.wikidata.org/entity/Q79663> rdfs:label "Alexander City" ; + ns1:P17 <http://www.wikidata.org/entity/Q30> ; + ns1:P625 "Point(-85.936008 32.933157)" . <http://www.wikidata.org/entity/Q538> rdfs:label "Oceania" . <http://www.wikidata.org/entity/Q49> rdfs:label "North America" . |