diff options
author | lltommy | 2020-11-11 09:56:12 +0100 |
---|---|---|
committer | lltommy | 2020-11-11 09:56:12 +0100 |
commit | d6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch) | |
tree | 6e8b77bde4dc34fab3fa8804906f3cb821f61dae /scripts/db_enrichment | |
parent | c5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff) | |
parent | c872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff) | |
download | bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.gz bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.tar.lz bh20-seq-resource-d6aa323b6fc7a82e45cc1df51fc72c2d547146eb.zip |
Merge branch 'master' of https://github.com/arvados/bh20-seq-resource
Diffstat (limited to 'scripts/db_enrichment')
-rw-r--r-- | scripts/db_enrichment/.gitignore | 1 | ||||
-rw-r--r-- | scripts/db_enrichment/country_enrichment.py | 64 | ||||
-rw-r--r-- | scripts/db_enrichment/input_location.csv | 8 | ||||
-rw-r--r-- | scripts/db_enrichment/readme.md | 14 |
4 files changed, 63 insertions, 24 deletions
diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore new file mode 100644 index 0000000..30b159b --- /dev/null +++ b/scripts/db_enrichment/.gitignore @@ -0,0 +1 @@ +enriched_output.txt diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py index 8dcf5f2..37329fb 100644 --- a/scripts/db_enrichment/country_enrichment.py +++ b/scripts/db_enrichment/country_enrichment.py @@ -1,3 +1,12 @@ +# This script by @LLTommy queries the main SPARQL end point to find what +# collections are missing country information for GPS coordinates, such +# +# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ; +# ns1:P17 <http://www.wikidata.org/entity/Q664> ; +# ns1:P625 "Point(172.0 -43.6)" . +# +# See also the ./readme.md + import requests import csv from rdflib import Graph, Literal, RDF, URIRef @@ -30,30 +39,49 @@ def callSPARQL(query): g = Graph() +test_query=""" +# Use with https://query.wikidata.org/ +SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE { + BIND (XXX as ?a) . + OPTIONAL { + ?a wdt:P625 ?coor. + } + ?a rdfs:label ?label . + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + OPTIONAL { + ?country wdt:P30 ?continent. + ?continent rdfs:label ?continent_label + FILTER (lang(?continent_label)='en') + } + FILTER (lang(?country_label)='en') + FILTER (lang(?label)='en') +} +""" +# wdt:P625 are GEO coordinates query = """ construct { - ?a wdt:P625 ?c. + ?a wdt:P625 ?c. ?a rdfs:label ?label . - ?a wdt:P17 ?country. - ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. - ?continent rdfs:label ?continent_label -} WHERE -{ - BIND (XXX as ?a) . - ?a wdt:P625 ?c. + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + ?country wdt:P30 ?continent . + ?continent rdfs:label ?continent_label . +} WHERE +{ + BIND (XXX as ?a) . ?a rdfs:label ?label . - ?a wdt:P17 ?country. - ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + ?country wdt:P30 ?continent. ?continent rdfs:label ?continent_label - FILTER (lang(?continent_label)='en') + FILTER (lang(?continent_label)='en') FILTER (lang(?country_label)='en') - FILTER (lang(?label)='en') - -} + FILTER (lang(?label)='en') + OPTIONAL { ?a wdt:P625 ?c } +} """"" outputFile = 'input_location.csv' @@ -65,6 +93,8 @@ with open(outputFile, 'r') as csvfile: counter=counter+1 try: + testq = test_query.replace("XXX", "<"+row[0]+">") + print(testq) tmpquery=query.replace("XXX", "<"+row[0]+">") print(tmpquery) @@ -88,4 +118,4 @@ with open(outputFile, 'r') as csvfile: raise print(g.serialize(format='n3').decode("utf-8")) -g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file +g.serialize(destination='enriched_output.txt', format='turtle') diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index 364afc8..8c3308f 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,2 +1,6 @@ -http://www.wikidata.org/entity/Q111904 -http://www.wikidata.org/entity/Q1070
\ No newline at end of file +http://www.wikidata.org/entity/Q7960498 +http://www.wikidata.org/entity/Q692895 +http://www.wikidata.org/entity/Q2722074 +http://www.wikidata.org/entity/Q25622187 +http://www.wikidata.org/entity/Q27684996 +http://www.wikidata.org/entity/Q2757125 diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md index 55ec496..7539104 100644 --- a/scripts/db_enrichment/readme.md +++ b/scripts/db_enrichment/readme.md @@ -11,11 +11,15 @@ File containing information about the countries in our database. Additional info This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet: ->SELECT DISTINCT ?geoLocation WHERE ->{ ->?fasta ?x [ <<http://purl.obolibrary.org/obo/GAZ_00000448>> ?geoLocation] . ->FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label} ->} +```sparql +SELECT DISTINCT ?geoLocation WHERE +{ + ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] . + FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label} +} +``` + +[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+) - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output) |