From 43d7264dda8061a024befbc9ca0a89d7159b1e40 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 09:52:32 +0000 Subject: UTHSC upload info --- scripts/uthsc_samples/.gitignore | 1 + scripts/uthsc_samples/template.yaml | 35 +++++++++++++++++++++ scripts/uthsc_samples/uthsc_samples.py | 57 ++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 scripts/uthsc_samples/.gitignore create mode 100644 scripts/uthsc_samples/template.yaml create mode 100644 scripts/uthsc_samples/uthsc_samples.py (limited to 'scripts') diff --git a/scripts/uthsc_samples/.gitignore b/scripts/uthsc_samples/.gitignore new file mode 100644 index 0000000..8786e3f --- /dev/null +++ b/scripts/uthsc_samples/.gitignore @@ -0,0 +1 @@ +yaml diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml new file mode 100644 index 0000000..1175ac8 --- /dev/null +++ b/scripts/uthsc_samples/template.yaml @@ -0,0 +1,35 @@ +id: placeholder + +license: + license_type: http://creativecommons.org/licenses/by/4.0/ + title: "$sample_name - $locationx" + attribution_name: "Mariah Taylor, Colleen Jonsson" + attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php + +host: + host_id: "$sample_id" + host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 + +sample: + sample_id: "$sample_id" + specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] + collection_date: "$collection_date" + collection_location: $location + +virus: + virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 + virus_strain: "$strain" + +technology: + sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] + sequence_assembly_method: https://bio.tools/BWA#! + additional_technology_information: Oxford Nanopore MiniIon RNA long reads + +submitter: + authors: [Mariah Taylor, Colleen Jonsson] + submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins] + submitter_address: UTHSC, Memphis, Tennessee 38163, USA + originating_lab: Regional Biocontainment Laboratory, Memphis, TN + provider_sample_id: $sample_id + submitter_sample_id: $sample_id + submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162] diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py new file mode 100644 index 0000000..5c39398 --- /dev/null +++ b/scripts/uthsc_samples/uthsc_samples.py @@ -0,0 +1,57 @@ +import os +import pandas as pd +from string import Template +from dateutil.parser import parse +import re + +import sys + +# Metadata in tabular format in a spreadsheet(?!) +xlsx = '../../test/data/10_samples.xlsx' + +# Template in a text file +template_yaml = 'template.yaml' + +dir_output = 'yaml' + +if not os.path.exists(dir_output): + os.makedirs(dir_output) + +table = pd.read_excel(xlsx) + +print(table) + +for index, row in table.iterrows(): + sample = row['Sample ID'] + print(f"Processing sample {sample}...") + + with open(template_yaml) as f: + text = Template(f.read()) + with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw: + sample_id = sample + sample_name = sample + collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d') + locationx = row['City']+", "+row['State']+", USA" + location = "https://www.wikidata.org/wiki/Q16563" # Memphis by default + map = { + "Pegram": "https://www.wikidata.org/wiki/Q3289517", + "Alexander": "https://www.wikidata.org/wiki/Q79663", + "Smithville": "https://www.wikidata.org/wiki/Q2145339", + "Nashville": "https://www.wikidata.org/wiki/Q23197", + "Madison": "https://www.wikidata.org/wiki/Q494755" + } + + for name in map: + p = re.compile(name) + if p.match(locationx): + location = map[name] + break + + strain = f"SARS-CoV-2/human/USA/{sample}/2020" + fw.write(text.substitute(sample_id=sample_id, + sample_name=sample_name, + collection_date=collection_date, + location=location, + locationx=locationx, + strain=strain + )) -- cgit v1.2.3 From d75f1c74fbf86652b02520de6ed46c981cf27e50 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 10:13:05 +0000 Subject: Adding Tennessee items --- doc/INSTALL.md | 5 ++++ scripts/db_enrichment/.gitignore | 1 + scripts/db_enrichment/country_enrichment.py | 43 +++++++++++++++++------------ scripts/db_enrichment/input_location.csv | 7 +++-- scripts/db_enrichment/readme.md | 2 ++ semantic_enrichment/countries.ttl | 18 ++++++++++++ 6 files changed, 56 insertions(+), 20 deletions(-) create mode 100644 scripts/db_enrichment/.gitignore (limited to 'scripts') diff --git a/doc/INSTALL.md b/doc/INSTALL.md index 96cf1d4..f54c8f2 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -68,6 +68,11 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/i Note: see above on GUIX_PACKAGE_PATH. +## Run country semantic enrichment script + + cd bh20-seq-resource/scripts/db_enrichment + guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py + ## Run the tests guix package -i python-requests python-pandas python-jinja2 python -p ~/opt/python-dev diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore new file mode 100644 index 0000000..30b159b --- /dev/null +++ b/scripts/db_enrichment/.gitignore @@ -0,0 +1 @@ +enriched_output.txt diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py index 8dcf5f2..1f99d42 100644 --- a/scripts/db_enrichment/country_enrichment.py +++ b/scripts/db_enrichment/country_enrichment.py @@ -1,3 +1,12 @@ +# This script by @LLTommy queries the main SPARQL end point to find what +# collections are missing country information for GPS coordinates, such +# +# rdfs:label "Canterbury Region" ; +# ns1:P17 ; +# ns1:P625 "Point(172.0 -43.6)" . +# +# See also the ./readme.md + import requests import csv from rdflib import Graph, Literal, RDF, URIRef @@ -30,30 +39,28 @@ def callSPARQL(query): g = Graph() - - query = """ construct { - ?a wdt:P625 ?c. + ?a wdt:P625 ?c. ?a rdfs:label ?label . - ?a wdt:P17 ?country. - ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. - ?continent rdfs:label ?continent_label -} WHERE -{ - BIND (XXX as ?a) . - ?a wdt:P625 ?c. + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + ?country wdt:P30 ?continent. + ?continent rdfs:label ?continent_label +} WHERE +{ + BIND (XXX as ?a) . + ?a wdt:P625 ?c. ?a rdfs:label ?label . - ?a wdt:P17 ?country. - ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + ?country wdt:P30 ?continent. ?continent rdfs:label ?continent_label - FILTER (lang(?continent_label)='en') + FILTER (lang(?continent_label)='en') FILTER (lang(?country_label)='en') - FILTER (lang(?label)='en') + FILTER (lang(?label)='en') -} +} """"" outputFile = 'input_location.csv' @@ -88,4 +95,4 @@ with open(outputFile, 'r') as csvfile: raise print(g.serialize(format='n3').decode("utf-8")) -g.serialize(destination='enriched_ouput.txt', format='turtle') \ No newline at end of file +g.serialize(destination='enriched_output.txt', format='turtle') diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index 364afc8..eb5322a 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,2 +1,5 @@ -http://www.wikidata.org/entity/Q111904 -http://www.wikidata.org/entity/Q1070 \ No newline at end of file +http://www.wikidata.org/entity/Q3289517 +http://www.wikidata.org/entity/Q79663 +http://www.wikidata.org/entity/Q2145339 +http://www.wikidata.org/entity/Q23197 +http://www.wikidata.org/entity/Q494755 diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md index 55ec496..88e8be5 100644 --- a/scripts/db_enrichment/readme.md +++ b/scripts/db_enrichment/readme.md @@ -17,5 +17,7 @@ This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countrie >FILTER NOT EXISTS {?geoLocation <> ?geoLocation_tmp_label} >} +[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+) + - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output) diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl index 08e9c38..fe50b16 100644 --- a/semantic_enrichment/countries.ttl +++ b/semantic_enrichment/countries.ttl @@ -1328,7 +1328,25 @@ ns1:P17 ; ns1:P625 "Point(31.239444444 30.056111111)" . + rdfs:label "Smithville" ; + ns1:P17 ; + ns1:P625 "Point(-85.820833333 35.957222222)" . + + rdfs:label "Nashville" ; + ns1:P17 ; + ns1:P625 "Point(-86.783888888 36.165)" . + + rdfs:label "Pegram" ; + ns1:P17 ; + ns1:P625 "Point(-87.051666666 36.101666666)" . + + rdfs:label "Madison County" ; + ns1:P17 ; + ns1:P625 "Point(-88.84 35.61)" . + rdfs:label "Alexander City" ; + ns1:P17 ; + ns1:P625 "Point(-85.936008 32.933157)" . rdfs:label "Oceania" . rdfs:label "North America" . -- cgit v1.2.3 From 039ad90b1627464b687adddb08cf489dca3c5fbc Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 10:14:21 +0000 Subject: Using correct wikidata geo link --- scripts/uthsc_samples/uthsc_samples.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'scripts') diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py index 5c39398..c18c07a 100644 --- a/scripts/uthsc_samples/uthsc_samples.py +++ b/scripts/uthsc_samples/uthsc_samples.py @@ -32,13 +32,13 @@ for index, row in table.iterrows(): sample_name = sample collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d') locationx = row['City']+", "+row['State']+", USA" - location = "https://www.wikidata.org/wiki/Q16563" # Memphis by default + location = "http://www.wikidata.org/enitity/Q16563" # Memphis by default map = { - "Pegram": "https://www.wikidata.org/wiki/Q3289517", - "Alexander": "https://www.wikidata.org/wiki/Q79663", - "Smithville": "https://www.wikidata.org/wiki/Q2145339", - "Nashville": "https://www.wikidata.org/wiki/Q23197", - "Madison": "https://www.wikidata.org/wiki/Q494755" + "Pegram": "http://www.wikidata.org/enitity/Q3289517", + "Alexander": "http://www.wikidata.org/enitity/Q79663", + "Smithville": "http://www.wikidata.org/enitity/Q2145339", + "Nashville": "http://www.wikidata.org/enitity/Q23197", + "Madison": "http://www.wikidata.org/enitity/Q494755" } for name in map: -- cgit v1.2.3 From 951ebe949d88cdbfed028e0a2a420ce7921c3919 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 10:31:56 +0000 Subject: Countries --- scripts/db_enrichment/input_location.csv | 21 ++++++--- scripts/uthsc_samples/uthsc_samples.py | 12 ++--- semantic_enrichment/countries.ttl | 75 ++++++++++++++++++++++++-------- 3 files changed, 80 insertions(+), 28 deletions(-) (limited to 'scripts') diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index eb5322a..a4246cd 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,5 +1,16 @@ -http://www.wikidata.org/entity/Q3289517 -http://www.wikidata.org/entity/Q79663 -http://www.wikidata.org/entity/Q2145339 -http://www.wikidata.org/entity/Q23197 -http://www.wikidata.org/entity/Q494755 +http://www.wikidata.org/entity/Q7960498 +http://www.wikidata.org/entity/Q692895 +http://www.wikidata.org/entity/Q928 +http://www.wikidata.org/entity/Q2722074 +http://www.wikidata.org/entity/Q25622187 +http://www.wikidata.org/entity/Q27684996 +http://www.wikidata.org/entity/Q2757125 +http://www.wikidata.org/entity/Q1922283 +http://www.wikidata.org/entity/Q490 +http://www.wikidata.org/entity/Q677037 +http://www.wikidata.org/entity/Q3037 +http://www.wikidata.org/entity/Q843 +http://www.wikidata.org/entity/Q183 +http://www.wikidata.org/entity/Q29 +http://www.wikidata.org/entity/Q17 +http://www.wikidata.org/entity/Q810 diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py index c18c07a..3ad2561 100644 --- a/scripts/uthsc_samples/uthsc_samples.py +++ b/scripts/uthsc_samples/uthsc_samples.py @@ -32,13 +32,13 @@ for index, row in table.iterrows(): sample_name = sample collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d') locationx = row['City']+", "+row['State']+", USA" - location = "http://www.wikidata.org/enitity/Q16563" # Memphis by default + location = "http://www.wikidata.org/entity/Q16563" # Memphis by default map = { - "Pegram": "http://www.wikidata.org/enitity/Q3289517", - "Alexander": "http://www.wikidata.org/enitity/Q79663", - "Smithville": "http://www.wikidata.org/enitity/Q2145339", - "Nashville": "http://www.wikidata.org/enitity/Q23197", - "Madison": "http://www.wikidata.org/enitity/Q494755" + "Pegram": "http://www.wikidata.org/entity/Q3289517", + "Alexander": "http://www.wikidata.org/entity/Q79663", + "Smithville": "http://www.wikidata.org/entity/Q2145339", + "Nashville": "http://www.wikidata.org/entity/Q23197", + "Madison": "http://www.wikidata.org/entity/Q494755" } for name in map: diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl index fe50b16..728877f 100644 --- a/semantic_enrichment/countries.ttl +++ b/semantic_enrichment/countries.ttl @@ -220,7 +220,6 @@ ns1:P17 ; ns1:P625 "Point(-120.0 37.0)" . - rdfs:label "Brazil" ; ns1:P17 ; ns1:P30 ; @@ -1157,7 +1156,9 @@ ns1:P625 "Point(28.0 -14.0)" . rdfs:label "Mexico" ; - ns1:P30 . + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(-102.0 23.0)" . rdfs:label "Morocco" ; ns1:P17 ; @@ -1170,32 +1171,17 @@ ; ns1:P625 "Point(94.25 66.416666666)" . - rdfs:label "Germany" ; - ns1:P30 . - - rdfs:label "Spain" ; - ns1:P30 . - - rdfs:label "Italy" ; - ns1:P30 . - rdfs:label "Netherlands" ; ns1:P17 ; ns1:P30 , ; ns1:P625 "Point(5.55 52.316666666)" . - rdfs:label "Jordan" ; - ns1:P30 . - rdfs:label "Sierra Leone" ; ns1:P17 ; ns1:P30 ; ns1:P625 "Point(-11.916667 8.5)" . - rdfs:label "Japan" ; - ns1:P30 . - rdfs:label "Tunisia" ; ns1:P17 ; ns1:P30 ; @@ -1348,6 +1334,61 @@ ns1:P17 ; ns1:P625 "Point(-85.936008 32.933157)" . + rdfs:label "Mehsana" ; + ns1:P17 ; + ns1:P625 "Point(72.4 23.6)" . + + rdfs:label "Kathmandu" ; + ns1:P17 ; + ns1:P625 "Point(85.366666666 27.716666666)" . + + rdfs:label "Milan" ; + ns1:P17 ; + ns1:P625 "Point(9.19 45.466944444)" . + + rdfs:label "Telangana" ; + ns1:P17 ; + ns1:P625 "Point(79.59 17.99)" . + + rdfs:label "Philippines" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(123.0 12.0)" . + + rdfs:label "Mehsana" ; + ns1:P17 ; + ns1:P625 "Point(72.4 23.6)" . + + rdfs:label "Japan" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(136.0 35.0)" . + + rdfs:label "Germany" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(10.0 51.0)" . + + rdfs:label "Spain" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(-3.5 40.2)" . + + rdfs:label "Jordan" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(36.5 31.2)" . + + rdfs:label "Pakistan" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(71.0 30.0)" . + + rdfs:label "Philippines" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(123.0 12.0)" . + rdfs:label "Oceania" . rdfs:label "North America" . rdfs:label "South America" . -- cgit v1.2.3 From 7c74a20b90ca647ca387eff2ed830c22f5ba1282 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 12:48:00 +0000 Subject: Country trouble shooting --- doc/INSTALL.md | 1 + scripts/db_enrichment/country_enrichment.py | 29 ++++++++++++++++++++++++++--- scripts/db_enrichment/input_location.csv | 10 ---------- scripts/db_enrichment/readme.md | 12 +++++++----- 4 files changed, 34 insertions(+), 18 deletions(-) (limited to 'scripts') diff --git a/doc/INSTALL.md b/doc/INSTALL.md index 45aca0f..367b452 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -77,6 +77,7 @@ Note: see above on GUIX_PACKAGE_PATH. ## Run country semantic enrichment script cd bh20-seq-resource/scripts/db_enrichment + edit input_location.csv guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py ## Run the tests diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py index 1f99d42..f62a64e 100644 --- a/scripts/db_enrichment/country_enrichment.py +++ b/scripts/db_enrichment/country_enrichment.py @@ -39,14 +39,36 @@ def callSPARQL(query): g = Graph() +test_query=""" +# Use with https://query.wikidata.org/ +SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE { + BIND (XXX as ?a) . + OPTIONAL { + ?a wdt:P625 ?coor. + } + ?a rdfs:label ?label . + ?a wdt:P17 ?country. + ?country rdfs:label ?country_label . + OPTIONAL { + ?country wdt:P30 ?continent. + ?continent rdfs:label ?continent_label + FILTER (lang(?continent_label)='en') + } + FILTER (lang(?country_label)='en') + FILTER (lang(?label)='en') +} +""" + +# wdt:P625 are GEO coordinates + query = """ construct { ?a wdt:P625 ?c. ?a rdfs:label ?label . ?a wdt:P17 ?country. ?country rdfs:label ?country_label . - ?country wdt:P30 ?continent. - ?continent rdfs:label ?continent_label + ?country wdt:P30 ?continent . + ?continent rdfs:label ?continent_label . } WHERE { BIND (XXX as ?a) . @@ -59,7 +81,6 @@ construct { FILTER (lang(?continent_label)='en') FILTER (lang(?country_label)='en') FILTER (lang(?label)='en') - } """"" @@ -72,6 +93,8 @@ with open(outputFile, 'r') as csvfile: counter=counter+1 try: + testq = test_query.replace("XXX", "<"+row[0]+">") + print(testq) tmpquery=query.replace("XXX", "<"+row[0]+">") print(tmpquery) diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv index a4246cd..8c3308f 100644 --- a/scripts/db_enrichment/input_location.csv +++ b/scripts/db_enrichment/input_location.csv @@ -1,16 +1,6 @@ http://www.wikidata.org/entity/Q7960498 http://www.wikidata.org/entity/Q692895 -http://www.wikidata.org/entity/Q928 http://www.wikidata.org/entity/Q2722074 http://www.wikidata.org/entity/Q25622187 http://www.wikidata.org/entity/Q27684996 http://www.wikidata.org/entity/Q2757125 -http://www.wikidata.org/entity/Q1922283 -http://www.wikidata.org/entity/Q490 -http://www.wikidata.org/entity/Q677037 -http://www.wikidata.org/entity/Q3037 -http://www.wikidata.org/entity/Q843 -http://www.wikidata.org/entity/Q183 -http://www.wikidata.org/entity/Q29 -http://www.wikidata.org/entity/Q17 -http://www.wikidata.org/entity/Q810 diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md index 88e8be5..7539104 100644 --- a/scripts/db_enrichment/readme.md +++ b/scripts/db_enrichment/readme.md @@ -11,11 +11,13 @@ File containing information about the countries in our database. Additional info This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet: ->SELECT DISTINCT ?geoLocation WHERE ->{ ->?fasta ?x [ <> ?geoLocation] . ->FILTER NOT EXISTS {?geoLocation <> ?geoLocation_tmp_label} ->} +```sparql +SELECT DISTINCT ?geoLocation WHERE +{ + ?fasta ?x [ ?geoLocation] . + FILTER NOT EXISTS {?geoLocation ?geoLocation_tmp_label} +} +``` [Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+) -- cgit v1.2.3 From 6c654dd60f98d473ba94fda6143d8b8b00f99586 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 12:58:38 +0000 Subject: Add country entries that miss coordinates --- scripts/db_enrichment/country_enrichment.py | 2 +- semantic_enrichment/countries.ttl | 30 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'scripts') diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py index f62a64e..37329fb 100644 --- a/scripts/db_enrichment/country_enrichment.py +++ b/scripts/db_enrichment/country_enrichment.py @@ -72,7 +72,6 @@ construct { } WHERE { BIND (XXX as ?a) . - ?a wdt:P625 ?c. ?a rdfs:label ?label . ?a wdt:P17 ?country. ?country rdfs:label ?country_label . @@ -81,6 +80,7 @@ construct { FILTER (lang(?continent_label)='en') FILTER (lang(?country_label)='en') FILTER (lang(?label)='en') + OPTIONAL { ?a wdt:P625 ?c } } """"" diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl index 728877f..b0651cf 100644 --- a/semantic_enrichment/countries.ttl +++ b/semantic_enrichment/countries.ttl @@ -348,11 +348,6 @@ ns1:P30 ; ns1:P625 "Point(137.0 -28.0)" . - rdfs:label "India" ; - ns1:P17 ; - ns1:P30 ; - ns1:P625 "Point(83.0 22.8)" . - rdfs:label "Colombia" ; ns1:P17 ; ns1:P30 ; @@ -1389,6 +1384,31 @@ ns1:P30 ; ns1:P625 "Point(123.0 12.0)" . + rdfs:label "Bayad" ; + ns1:P17 ; + ns1:P625 "Point(73.0 20.8)" . + + rdfs:label "Choryasi Taluka" ; + ns1:P17 ; + ns1:P625 "Point(73.0 20.8)" . + + rdfs:label "Daskroi" ; + ns1:P17 ; + ns1:P625 "Point(72.0 22.0)" . + + rdfs:label "Wairarapa" ; + ns1:P17 ; + ns1:P625 "Point(174.0 -41.2)" . + + rdfs:label "Waitemata City" ; + ns1:P17 ; + ns1:P625 "Point(174.0 -41.2)" . + + rdfs:label "India" ; + ns1:P17 ; + ns1:P30 ; + ns1:P625 "Point(83.0 22.8)" . + rdfs:label "Oceania" . rdfs:label "North America" . rdfs:label "South America" . -- cgit v1.2.3 From cb2200839e180d518167bce06395ae04c332ddf4 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 8 Nov 2020 10:35:03 +0000 Subject: UTHSC samples --- example/uthsc_example.yaml | 4 ++-- scripts/uthsc_samples/template.yaml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'scripts') diff --git a/example/uthsc_example.yaml b/example/uthsc_example.yaml index 18188dc..956faf1 100644 --- a/example/uthsc_example.yaml +++ b/example/uthsc_example.yaml @@ -1,10 +1,10 @@ id: placeholder license: - license_type: http://creativecommons.org/licenses/by/4.0/ + license_type: https://creativecommons.org/licenses/by/4.0/ title: "Sample" attribution_name: "Mariah Taylor, Colleen Jonsson" - attribution_url: https://www.uthsc.edu/rbl/ + attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php host: host_id: TN_UT2 diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml index 1175ac8..07e0828 100644 --- a/scripts/uthsc_samples/template.yaml +++ b/scripts/uthsc_samples/template.yaml @@ -1,9 +1,9 @@ id: placeholder license: - license_type: http://creativecommons.org/licenses/by/4.0/ - title: "$sample_name - $locationx" - attribution_name: "Mariah Taylor, Colleen Jonsson" + license_type: https://creativecommons.org/licenses/by/4.0/ + title: "$strain" + attribution_name: "Mariah Taylor, Colleen B. Jonsson" attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php host: @@ -23,10 +23,10 @@ virus: technology: sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] sequence_assembly_method: https://bio.tools/BWA#! - additional_technology_information: Oxford Nanopore MiniIon RNA long reads + additional_technology_information: "Oxford Nanopore MiniIon RNA long reads" submitter: - authors: [Mariah Taylor, Colleen Jonsson] + authors: [Mariah Taylor, Colleen B. Jonsson] submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins] submitter_address: UTHSC, Memphis, Tennessee 38163, USA originating_lab: Regional Biocontainment Laboratory, Memphis, TN -- cgit v1.2.3 From b311e2ec0f1d02cf16152855dd8bdd760ed4578b Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 8 Nov 2020 10:50:24 +0000 Subject: Uploaded UTHSC sequences --- scripts/uthsc_samples/uthsc_samples.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'scripts') diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py index 3ad2561..54c70ee 100644 --- a/scripts/uthsc_samples/uthsc_samples.py +++ b/scripts/uthsc_samples/uthsc_samples.py @@ -55,3 +55,5 @@ for index, row in table.iterrows(): locationx=locationx, strain=strain )) + + print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa") -- cgit v1.2.3 From 11812bbf95ddf1771a159b7ef6580a9179c0cad1 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Nov 2020 05:02:57 -0600 Subject: virtuoso: Added a --no-cache option --- scripts/update_virtuoso/check_for_updates.py | 34 ++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'scripts') diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py index 8761c8a..a63f4d1 100755 --- a/scripts/update_virtuoso/check_for_updates.py +++ b/scripts/update_virtuoso/check_for_updates.py @@ -4,7 +4,7 @@ # # You can run this in a Guix container with # -# ~/opt/guix/bin/guix environment -C guix --ad-hoc python python-requests curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba +# ~/opt/guix/bin/guix environment -C guix --ad-hoc raptor2 python python-requests curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba # # Note you'll need to run from the root dir. Remove the ./cache.txt file if you want to force an update. # @@ -19,6 +19,11 @@ fn = sys.argv[1] user = sys.argv[2] pwd = sys.argv[3] +no_cache = False +if fn == "--no-cache": + no_cache = True + print("Skipping cache check and download of metadata.ttl") + scriptdir = os.path.dirname(os.path.realpath(__file__)) print(scriptdir) basedir = os.path.dirname(os.path.dirname(scriptdir)) @@ -29,6 +34,15 @@ def upload(fn): # print("DELETE "+fn) # cmd = ("curl --digest --user dba:%s --verbose --url -G http://sparql.genenetwork.org/sparql-graph-crud-auth --data-urlencode graph=http://covid-19.genenetwork.org/graph -X DELETE" % pwd).split(" ") + print("VALIDATE "+fn) + cmd = f"rapper -i turtle {fn}" + print(cmd) + p = subprocess.Popen(cmd.split(" "),stdout=subprocess.PIPE,stderr=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + print(out,err) + assert(p.returncode == 0) + print("UPLOADING "+fn) cmd = ("curl -X PUT --digest -u dba:%s -H Content-Type:text/turtle -T %s -G http://sparql.genenetwork.org/sparql-graph-crud-auth --data-urlencode graph=http://covid-19.genenetwork.org/graph/%s" % (pwd, fn, os.path.basename(fn)) ) print(cmd) @@ -39,6 +53,7 @@ def upload(fn): url = 'https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl' # --- Fetch headers from TTL file on Arvados +# curl --head https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl r = requests.head(url) print(r.headers) print(r.headers['Last-Modified']) @@ -49,14 +64,14 @@ last_modified_str = r.headers['Last-Modified'] t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" ) print(t_stamp) -# OK, it works, now check last stored value +# OK, it works, now check last stored value in the cache stamp = None if os.path.isfile(fn): file = open(fn,"r") stamp = file.read() file.close -if stamp != last_modified_str: +if no_cache or stamp != last_modified_str: print("Delete graphs") for graph in ["labels.ttl", "metadata.ttl", "countries.ttl"]: cmd = ("curl --digest -u dba:%s --verbose --url http://127.0.0.1:8890/sparql-graph-crud-auth?graph=http://covid-19.genenetwork.org/graph/%s -X DELETE" % (pwd, graph)) @@ -69,12 +84,13 @@ if stamp != last_modified_str: upload(basedir+"/semantic_enrichment/labels.ttl") upload(basedir+"/semantic_enrichment/countries.ttl") - print("Fetch metadata TTL") - r = requests.get(url) - assert(r.status_code == 200) - with open("metadata.ttl", "w") as f: - f.write(r.text) - f.close + if not no_cache: + print("Fetch metadata TTL") + r = requests.get(url) + assert(r.status_code == 200) + with open("metadata.ttl", "w") as f: + f.write(r.text) + f.close upload("metadata.ttl") with open(fn,"w") as f: f.write(last_modified_str) -- cgit v1.2.3 From 986bacf77191a159d842605f6bb86f3f92a3be54 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Nov 2020 11:41:09 +0000 Subject: virtuoso: header change --- scripts/update_virtuoso/check_for_updates.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'scripts') diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py index a63f4d1..fb66c2e 100755 --- a/scripts/update_virtuoso/check_for_updates.py +++ b/scripts/update_virtuoso/check_for_updates.py @@ -56,13 +56,14 @@ url = 'https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mer # curl --head https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl r = requests.head(url) print(r.headers) -print(r.headers['Last-Modified']) +if not no_cache: + print(r.headers['Last-Modified']) -# --- Convert/validate time stamp -# ValueError: time data 'Tue, 21 Apr 2020 23:47:43 GMT' does not match format '%a %b %d %H:%M:%S %Y' -last_modified_str = r.headers['Last-Modified'] -t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" ) -print(t_stamp) + # --- Convert/validate time stamp + # ValueError: time data 'Tue, 21 Apr 2020 23:47:43 GMT' does not match format '%a %b %d %H:%M:%S %Y' + last_modified_str = r.headers['Last-Modified'] + t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" ) + print(t_stamp) # OK, it works, now check last stored value in the cache stamp = None -- cgit v1.2.3 From b3eb10770bada631c929fec83247f6fda7ef22a4 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Tue, 10 Nov 2020 06:39:10 -0600 Subject: virtuoso: no-cache --- scripts/update_virtuoso/check_for_updates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py index fb66c2e..939a575 100755 --- a/scripts/update_virtuoso/check_for_updates.py +++ b/scripts/update_virtuoso/check_for_updates.py @@ -93,7 +93,8 @@ if no_cache or stamp != last_modified_str: f.write(r.text) f.close upload("metadata.ttl") - with open(fn,"w") as f: - f.write(last_modified_str) + if not no_cache: + with open(fn,"w") as f: + f.write(last_modified_str) else: print("Metadata is up to date") -- cgit v1.2.3