From d75f1c74fbf86652b02520de6ed46c981cf27e50 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 10:13:05 +0000 Subject: Adding Tennessee items --- scripts/db_enrichment/input_location.csv | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'scripts/db_enrichment/input_location.csv') diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index 364afc8..eb5322a 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,2 +1,5 @@ -http://www.wikidata.org/entity/Q111904 -http://www.wikidata.org/entity/Q1070 \ No newline at end of file +http://www.wikidata.org/entity/Q3289517 +http://www.wikidata.org/entity/Q79663 +http://www.wikidata.org/entity/Q2145339 +http://www.wikidata.org/entity/Q23197 +http://www.wikidata.org/entity/Q494755 -- cgit v1.2.3 From 951ebe949d88cdbfed028e0a2a420ce7921c3919 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 10:31:56 +0000 Subject: Countries --- scripts/db_enrichment/input_location.csv | 21 ++++++--- scripts/uthsc_samples/uthsc_samples.py | 12 ++--- semantic_enrichment/countries.ttl | 75 ++++++++++++++++++++++++-------- 3 files changed, 80 insertions(+), 28 deletions(-) (limited to 'scripts/db_enrichment/input_location.csv') diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index eb5322a..a4246cd 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,5 +1,16 @@ -http://www.wikidata.org/entity/Q3289517 -http://www.wikidata.org/entity/Q79663 -http://www.wikidata.org/entity/Q2145339 -http://www.wikidata.org/entity/Q23197 -http://www.wikidata.org/entity/Q494755 +http://www.wikidata.org/entity/Q7960498 +http://www.wikidata.org/entity/Q692895 +http://www.wikidata.org/entity/Q928 +http://www.wikidata.org/entity/Q2722074 +http://www.wikidata.org/entity/Q25622187 +http://www.wikidata.org/entity/Q27684996 
+http://www.wikidata.org/entity/Q2757125 +http://www.wikidata.org/entity/Q1922283 +http://www.wikidata.org/entity/Q490 +http://www.wikidata.org/entity/Q677037 +http://www.wikidata.org/entity/Q3037 +http://www.wikidata.org/entity/Q843 +http://www.wikidata.org/entity/Q183 +http://www.wikidata.org/entity/Q29 +http://www.wikidata.org/entity/Q17 +http://www.wikidata.org/entity/Q810 diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py index c18c07a..3ad2561 100644 --- a/scripts/uthsc_samples/uthsc_samples.py +++ b/scripts/uthsc_samples/uthsc_samples.py @@ -32,13 +32,13 @@ for index, row in table.iterrows(): sample_name = sample collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d') locationx = row['City']+", "+row['State']+", USA" - location = "http://www.wikidata.org/enitity/Q16563" # Memphis by default + location = "http://www.wikidata.org/entity/Q16563" # Memphis by default map = { - "Pegram": "http://www.wikidata.org/enitity/Q3289517", - "Alexander": "http://www.wikidata.org/enitity/Q79663", - "Smithville": "http://www.wikidata.org/enitity/Q2145339", - "Nashville": "http://www.wikidata.org/enitity/Q23197", - "Madison": "http://www.wikidata.org/enitity/Q494755" + "Pegram": "http://www.wikidata.org/entity/Q3289517", + "Alexander": "http://www.wikidata.org/entity/Q79663", + "Smithville": "http://www.wikidata.org/entity/Q2145339", + "Nashville": "http://www.wikidata.org/entity/Q23197", + "Madison": "http://www.wikidata.org/entity/Q494755" } for name in map: diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl index fe50b16..728877f 100644 --- a/semantic_enrichment/countries.ttl +++ b/semantic_enrichment/countries.ttl @@ -220,7 +220,6 @@ ns1:P17 ; ns1:P625 "Point(-120.0 37.0)" . - rdfs:label "Brazil" ; ns1:P17 ; ns1:P30 ; @@ -1157,7 +1156,9 @@ ns1:P625 "Point(28.0 -14.0)" . rdfs:label "Mexico" ; - ns1:P30 . + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(-102.0 23.0)" . 
rdfs:label "Morocco" ; ns1:P17 ; @@ -1170,32 +1171,17 @@ ; ns1:P625 "Point(94.25 66.416666666)" . - rdfs:label "Germany" ; - ns1:P30 . - - rdfs:label "Spain" ; - ns1:P30 . - - rdfs:label "Italy" ; - ns1:P30 . - rdfs:label "Netherlands" ; ns1:P17 ; ns1:P30 , ; ns1:P625 "Point(5.55 52.316666666)" . - rdfs:label "Jordan" ; - ns1:P30 . - rdfs:label "Sierra Leone" ; ns1:P17 ; ns1:P30 ; ns1:P625 "Point(-11.916667 8.5)" . - rdfs:label "Japan" ; - ns1:P30 . - rdfs:label "Tunisia" ; ns1:P17 ; ns1:P30 ; @@ -1348,6 +1334,61 @@ ns1:P17 ; ns1:P625 "Point(-85.936008 32.933157)" . + rdfs:label "Mehsana" ; + ns1:P17 ; + ns1:P625 "Point(72.4 23.6)" . + + rdfs:label "Kathmandu" ; + ns1:P17 ; + ns1:P625 "Point(85.366666666 27.716666666)" . + + rdfs:label "Milan" ; + ns1:P17 ; + ns1:P625 "Point(9.19 45.466944444)" . + + rdfs:label "Telangana" ; + ns1:P17 ; + ns1:P625 "Point(79.59 17.99)" . + + rdfs:label "Philippines" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(123.0 12.0)" . + + rdfs:label "Mehsana" ; + ns1:P17 ; + ns1:P625 "Point(72.4 23.6)" . + + rdfs:label "Japan" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(136.0 35.0)" . + + rdfs:label "Germany" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(10.0 51.0)" . + + rdfs:label "Spain" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(-3.5 40.2)" . + + rdfs:label "Jordan" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(36.5 31.2)" . + + rdfs:label "Pakistan" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(71.0 30.0)" . + + rdfs:label "Philippines" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(123.0 12.0)" . + rdfs:label "Oceania" . rdfs:label "North America" . rdfs:label "South America" . 
-- cgit v1.2.3 From 7c74a20b90ca647ca387eff2ed830c22f5ba1282 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 12:48:00 +0000 Subject: Country trouble shooting --- doc/INSTALL.md | 1 + scripts/db_enrichment/country_enrichment.py | 29 ++++++++++++++++++++++++++--- scripts/db_enrichment/input_location.csv | 10 ---------- scripts/db_enrichment/readme.md | 12 +++++++----- 4 files changed, 34 insertions(+), 18 deletions(-) (limited to 'scripts/db_enrichment/input_location.csv') diff --git a/doc/INSTALL.md b/doc/INSTALL.md index 45aca0f..367b452 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -77,6 +77,7 @@ Note: see above on GUIX_PACKAGE_PATH. ## Run country semantic enrichment script cd bh20-seq-resource/scripts/db_enrichment + edit input_location.csv guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py ## Run the tests diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py index 1f99d42..f62a64e 100644 --- a/scripts/db_enrichment/country_enrichment.py +++ b/scripts/db_enrichment/country_enrichment.py @@ -39,14 +39,36 @@ def callSPARQL(query): g = Graph() +test_query=""" +# Use with https://query.wikidata.org/ +SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE { + BIND (XXX as ?a) . + OPTIONAL { + ?a wdt:P625 ?coor. + } + ?a rdfs:label ?label . + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + OPTIONAL { + ?country wdt:P30 ?continent. + ?continent rdfs:label ?continent_label + FILTER (lang(?continent_label)='en') + } + FILTER (lang(?country_label)='en') + FILTER (lang(?label)='en') +} +""" + +# wdt:P625 are GEO coordinates + query = """ construct { ?a wdt:P625 ?c. ?a rdfs:label ?label . ?a wdt:P17 ?country. ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. - ?continent rdfs:label ?continent_label + ?country wdt:P30 ?continent . + ?continent rdfs:label ?continent_label . } WHERE { BIND (XXX as ?a) . 
@@ -59,7 +81,6 @@ construct { FILTER (lang(?continent_label)='en') FILTER (lang(?country_label)='en') FILTER (lang(?label)='en') - } """"" @@ -72,6 +93,8 @@ with open(outputFile, 'r') as csvfile: counter=counter+1 try: + testq = test_query.replace("XXX", "<"+row[0]+">") + print(testq) tmpquery=query.replace("XXX", "<"+row[0]+">") print(tmpquery) diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index a4246cd..8c3308f 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,16 +1,6 @@ http://www.wikidata.org/entity/Q7960498 http://www.wikidata.org/entity/Q692895 -http://www.wikidata.org/entity/Q928 http://www.wikidata.org/entity/Q2722074 http://www.wikidata.org/entity/Q25622187 http://www.wikidata.org/entity/Q27684996 http://www.wikidata.org/entity/Q2757125 -http://www.wikidata.org/entity/Q1922283 -http://www.wikidata.org/entity/Q490 -http://www.wikidata.org/entity/Q677037 -http://www.wikidata.org/entity/Q3037 -http://www.wikidata.org/entity/Q843 -http://www.wikidata.org/entity/Q183 -http://www.wikidata.org/entity/Q29 -http://www.wikidata.org/entity/Q17 -http://www.wikidata.org/entity/Q810 diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md index 88e8be5..7539104 100644 --- a/scripts/db_enrichment/readme.md +++ b/scripts/db_enrichment/readme.md @@ -11,11 +11,13 @@ File containing information about the countries in our database. Additional info This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet: ->SELECT DISTINCT ?geoLocation WHERE ->{ ->?fasta ?x [ <> ?geoLocation] . ->FILTER NOT EXISTS {?geoLocation <> ?geoLocation_tmp_label} ->} +```sparql +SELECT DISTINCT ?geoLocation WHERE +{ + ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] . 
+ FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label} +} +``` [Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+) -- cgit v1.2.3