author    lltommy 2020-11-11 09:56:12 +0100
committer lltommy 2020-11-11 09:56:12 +0100
commit    d6aa323b6fc7a82e45cc1df51fc72c2d547146eb (patch)
tree      6e8b77bde4dc34fab3fa8804906f3cb821f61dae /scripts
parent    c5fe5de7e4c77bfb48b1ae2f662c2d9cc120c06e (diff)
parent    c872248e43c1c66e5fed8ef341f7b4ac21d63e6f (diff)
Merge branch 'master' of https://github.com/arvados/bh20-seq-resource
Diffstat (limited to 'scripts')
-rw-r--r--  scripts/db_enrichment/.gitignore              |  1
-rw-r--r--  scripts/db_enrichment/country_enrichment.py   | 64
-rw-r--r--  scripts/db_enrichment/input_location.csv      |  8
-rw-r--r--  scripts/db_enrichment/readme.md               | 14
-rwxr-xr-x  scripts/update_virtuoso/check_for_updates.py  | 52
-rw-r--r--  scripts/uthsc_samples/.gitignore              |  1
-rw-r--r--  scripts/uthsc_samples/template.yaml           | 35
-rw-r--r--  scripts/uthsc_samples/uthsc_samples.py        | 59
8 files changed, 193 insertions(+), 41 deletions(-)
diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore
new file mode 100644
index 0000000..30b159b
--- /dev/null
+++ b/scripts/db_enrichment/.gitignore
@@ -0,0 +1 @@
+enriched_output.txt
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 8dcf5f2..37329fb 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -1,3 +1,12 @@
+# This script by @LLTommy queries the main SPARQL endpoint to find which
+# collections are missing country information for their GPS coordinates, such as
+#
+# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ;
+# ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+# ns1:P625 "Point(172.0 -43.6)" .
+#
+# See also the ./readme.md
+
import requests
import csv
from rdflib import Graph, Literal, RDF, URIRef
@@ -30,30 +39,49 @@ def callSPARQL(query):
g = Graph()
+test_query="""
+# Use with https://query.wikidata.org/
+SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE {
+ BIND (XXX as ?a) .
+ OPTIONAL {
+ ?a wdt:P625 ?coor.
+ }
+ ?a rdfs:label ?label .
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ OPTIONAL {
+ ?country wdt:P30 ?continent.
+ ?continent rdfs:label ?continent_label
+ FILTER (lang(?continent_label)='en')
+ }
+ FILTER (lang(?country_label)='en')
+ FILTER (lang(?label)='en')
+}
+"""
+# wdt:P625 is the geo-coordinates (GPS) property
query = """
construct {
- ?a wdt:P625 ?c.
+ ?a wdt:P625 ?c.
?a rdfs:label ?label .
- ?a wdt:P17 ?country.
- ?country rdfs:label ?country_label .
- ?country wdt:P30 ?continent.
- ?continent rdfs:label ?continent_label
-} WHERE
-{
- BIND (XXX as ?a) .
- ?a wdt:P625 ?c.
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent .
+ ?continent rdfs:label ?continent_label .
+} WHERE
+{
+ BIND (XXX as ?a) .
?a rdfs:label ?label .
- ?a wdt:P17 ?country.
- ?country rdfs:label ?country_label .
- ?country wdt:P30 ?continent.
+ ?a wdt:P17 ?country.
+ ?country rdfs:label ?country_label .
+ ?country wdt:P30 ?continent.
?continent rdfs:label ?continent_label
- FILTER (lang(?continent_label)='en')
+ FILTER (lang(?continent_label)='en')
FILTER (lang(?country_label)='en')
- FILTER (lang(?label)='en')
-
-}
+ FILTER (lang(?label)='en')
+ OPTIONAL { ?a wdt:P625 ?c }
+}
"""""
outputFile = 'input_location.csv'
@@ -65,6 +93,8 @@ with open(outputFile, 'r') as csvfile:
counter=counter+1
try:
+ testq = test_query.replace("XXX", "<"+row[0]+">")
+ print(testq)
tmpquery=query.replace("XXX", "<"+row[0]+">")
print(tmpquery)
@@ -88,4 +118,4 @@ with open(outputFile, 'r') as csvfile:
raise
print(g.serialize(format='n3').decode("utf-8"))
-g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
+g.serialize(destination='enriched_output.txt', format='turtle')
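
For context, the round-trip this script performs can be sketched as follows, assuming the public Wikidata endpoint at https://query.wikidata.org/sparql; the helper name `fetch_location_triples` and the trimmed-down CONSTRUCT query are illustrative, not part of the commit:

```python
# Minimal sketch, assuming the public Wikidata SPARQL endpoint and the
# requests/rdflib libraries already used by country_enrichment.py.
import requests
from rdflib import Graph

WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"  # assumed endpoint

def fetch_location_triples(entity_uri):
    """CONSTRUCT label/country triples for one Wikidata location entity."""
    query = """
    CONSTRUCT {
      ?a rdfs:label ?label .
      ?a wdt:P17 ?country .
      ?country rdfs:label ?country_label .
    } WHERE {
      BIND (<%s> AS ?a) .
      ?a rdfs:label ?label .
      ?a wdt:P17 ?country .
      ?country rdfs:label ?country_label .
      FILTER (lang(?label)='en')
      FILTER (lang(?country_label)='en')
    }
    """ % entity_uri
    # WDQS honours content negotiation, so ask for Turtle directly
    r = requests.get(WIKIDATA_ENDPOINT,
                     params={"query": query},
                     headers={"Accept": "text/turtle"})
    r.raise_for_status()
    g = Graph()
    g.parse(data=r.text, format="turtle")
    return g

# Example, using one of the ids from input_location.csv:
# g = fetch_location_triples("http://www.wikidata.org/entity/Q7960498")
# print(g.serialize(format="turtle"))
```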
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index 364afc8..8c3308f 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,2 +1,6 @@
-http://www.wikidata.org/entity/Q111904
-http://www.wikidata.org/entity/Q1070
\ No newline at end of file
+http://www.wikidata.org/entity/Q7960498
+http://www.wikidata.org/entity/Q692895
+http://www.wikidata.org/entity/Q2722074
+http://www.wikidata.org/entity/Q25622187
+http://www.wikidata.org/entity/Q27684996
+http://www.wikidata.org/entity/Q2757125
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 55ec496..7539104 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -11,11 +11,15 @@ File containing information about the countries in our database. Additional info
This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet:
->SELECT DISTINCT ?geoLocation WHERE
->{
->?fasta ?x [ <<http://purl.obolibrary.org/obo/GAZ_00000448>> ?geoLocation] .
->FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label}
->}
+```sparql
+SELECT DISTINCT ?geoLocation WHERE
+{
+ ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
+ FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
+}
+```
+
+[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
- Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder
- Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output)
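
A sketch of the first step, fetching the unlabeled ids and writing them to *input_location.csv*; the `format` parameter is taken from the run-query URL above, and the JSON traversal assumes the standard SPARQL 1.1 results layout:

```python
# Sketch: write the ids returned by the readme's query to input_location.csv.
import csv
import requests

ENDPOINT = "http://sparql.genenetwork.org/sparql/"
QUERY = """
SELECT DISTINCT ?geoLocation WHERE
{
  ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
  FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
}
"""

r = requests.get(ENDPOINT, params={
    "query": QUERY,
    "format": "application/sparql-results+json",  # assumed accepted by Virtuoso
})
r.raise_for_status()
with open("input_location.csv", "w") as f:
    w = csv.writer(f)
    for binding in r.json()["results"]["bindings"]:
        w.writerow([binding["geoLocation"]["value"]])
```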
diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py
index 8761c8a..939a575 100755
--- a/scripts/update_virtuoso/check_for_updates.py
+++ b/scripts/update_virtuoso/check_for_updates.py
@@ -4,7 +4,7 @@
#
# You can run this in a Guix container with
#
-# ~/opt/guix/bin/guix environment -C guix --ad-hoc python python-requests curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba
+# ~/opt/guix/bin/guix environment -C guix --ad-hoc raptor2 python python-requests curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba
#
# Note you'll need to run from the root dir. Remove the ./cache.txt file if you want to force an update.
#
@@ -19,6 +19,11 @@ fn = sys.argv[1]
user = sys.argv[2]
pwd = sys.argv[3]
+no_cache = False
+if fn == "--no-cache":
+ no_cache = True
+ print("Skipping cache check and download of metadata.ttl")
+
scriptdir = os.path.dirname(os.path.realpath(__file__))
print(scriptdir)
basedir = os.path.dirname(os.path.dirname(scriptdir))
@@ -29,6 +34,15 @@ def upload(fn):
# print("DELETE "+fn)
# cmd = ("curl --digest --user dba:%s --verbose --url -G http://sparql.genenetwork.org/sparql-graph-crud-auth --data-urlencode graph=http://covid-19.genenetwork.org/graph -X DELETE" % pwd).split(" ")
+ print("VALIDATE "+fn)
+ cmd = f"rapper -i turtle {fn}"
+ print(cmd)
+ p = subprocess.Popen(cmd.split(" "),stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ out, err = p.communicate()
+ if p.returncode != 0:
+ print(out,err)
+ assert(p.returncode == 0)
+
print("UPLOADING "+fn)
cmd = ("curl -X PUT --digest -u dba:%s -H Content-Type:text/turtle -T %s -G http://sparql.genenetwork.org/sparql-graph-crud-auth --data-urlencode graph=http://covid-19.genenetwork.org/graph/%s" % (pwd, fn, os.path.basename(fn)) )
print(cmd)
@@ -39,24 +53,26 @@ def upload(fn):
url = 'https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl'
# --- Fetch headers from TTL file on Arvados
+# curl --head https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl
r = requests.head(url)
print(r.headers)
-print(r.headers['Last-Modified'])
+if not no_cache:
+ print(r.headers['Last-Modified'])
-# --- Convert/validate time stamp
-# ValueError: time data 'Tue, 21 Apr 2020 23:47:43 GMT' does not match format '%a %b %d %H:%M:%S %Y'
-last_modified_str = r.headers['Last-Modified']
-t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" )
-print(t_stamp)
+ # --- Convert/validate time stamp
+ # ValueError: time data 'Tue, 21 Apr 2020 23:47:43 GMT' does not match format '%a %b %d %H:%M:%S %Y'
+ last_modified_str = r.headers['Last-Modified']
+ t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" )
+ print(t_stamp)
-# OK, it works, now check last stored value
+# OK, it works, now check last stored value in the cache
stamp = None
if os.path.isfile(fn):
file = open(fn,"r")
stamp = file.read()
file.close()
-if stamp != last_modified_str:
+if no_cache or stamp != last_modified_str:
print("Delete graphs")
for graph in ["labels.ttl", "metadata.ttl", "countries.ttl"]:
cmd = ("curl --digest -u dba:%s --verbose --url http://127.0.0.1:8890/sparql-graph-crud-auth?graph=http://covid-19.genenetwork.org/graph/%s -X DELETE" % (pwd, graph))
@@ -69,14 +85,16 @@ if stamp != last_modified_str:
upload(basedir+"/semantic_enrichment/labels.ttl")
upload(basedir+"/semantic_enrichment/countries.ttl")
- print("Fetch metadata TTL")
- r = requests.get(url)
- assert(r.status_code == 200)
- with open("metadata.ttl", "w") as f:
- f.write(r.text)
- f.close
+ if not no_cache:
+ print("Fetch metadata TTL")
+ r = requests.get(url)
+ assert(r.status_code == 200)
+ with open("metadata.ttl", "w") as f:
+ f.write(r.text)
+            f.close()
upload("metadata.ttl")
- with open(fn,"w") as f:
- f.write(last_modified_str)
+ if not no_cache:
+ with open(fn,"w") as f:
+ f.write(last_modified_str)
else:
print("Metadata is up to date")
diff --git a/scripts/uthsc_samples/.gitignore b/scripts/uthsc_samples/.gitignore
new file mode 100644
index 0000000..8786e3f
--- /dev/null
+++ b/scripts/uthsc_samples/.gitignore
@@ -0,0 +1 @@
+yaml
diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml
new file mode 100644
index 0000000..07e0828
--- /dev/null
+++ b/scripts/uthsc_samples/template.yaml
@@ -0,0 +1,35 @@
+id: placeholder
+
+license:
+ license_type: https://creativecommons.org/licenses/by/4.0/
+ title: "$strain"
+ attribution_name: "Mariah Taylor, Colleen B. Jonsson"
+ attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
+
+host:
+ host_id: "$sample_id"
+ host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+
+sample:
+ sample_id: "$sample_id"
+ specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
+ collection_date: "$collection_date"
+ collection_location: $location
+
+virus:
+ virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+ virus_strain: "$strain"
+
+technology:
+ sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
+ sequence_assembly_method: https://bio.tools/BWA#!
+  additional_technology_information: "Oxford Nanopore MinION RNA long reads"
+
+submitter:
+ authors: [Mariah Taylor, Colleen B. Jonsson]
+ submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
+ submitter_address: UTHSC, Memphis, Tennessee 38163, USA
+ originating_lab: Regional Biocontainment Laboratory, Memphis, TN
+ provider_sample_id: $sample_id
+ submitter_sample_id: $sample_id
+ submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162]
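
The quoted `$sample_id`, `$collection_date`, `$location` and `$strain` fields are Python `string.Template` placeholders, filled in per sample by *uthsc_samples.py* below. The substitution semantics in brief (the sample id is illustrative):

```python
from string import Template

t = Template('host_id: "$sample_id"')
print(t.substitute(sample_id="TN-2020-001"))  # -> host_id: "TN-2020-001"
```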
diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
new file mode 100644
index 0000000..54c70ee
--- /dev/null
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -0,0 +1,59 @@
+import os
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+import re
+
+import sys
+
+# Metadata in tabular format in a spreadsheet(?!)
+xlsx = '../../test/data/10_samples.xlsx'
+
+# Template in a text file
+template_yaml = 'template.yaml'
+
+dir_output = 'yaml'
+
+if not os.path.exists(dir_output):
+ os.makedirs(dir_output)
+
+table = pd.read_excel(xlsx)
+
+print(table)
+
+for index, row in table.iterrows():
+ sample = row['Sample ID']
+ print(f"Processing sample {sample}...")
+
+ with open(template_yaml) as f:
+ text = Template(f.read())
+ with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw:
+ sample_id = sample
+ sample_name = sample
+ collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
+ locationx = row['City']+", "+row['State']+", USA"
+ location = "http://www.wikidata.org/entity/Q16563" # Memphis by default
+ map = {
+ "Pegram": "http://www.wikidata.org/entity/Q3289517",
+ "Alexander": "http://www.wikidata.org/entity/Q79663",
+ "Smithville": "http://www.wikidata.org/entity/Q2145339",
+ "Nashville": "http://www.wikidata.org/entity/Q23197",
+ "Madison": "http://www.wikidata.org/entity/Q494755"
+ }
+
+ for name in map:
+ p = re.compile(name)
+ if p.match(locationx):
+ location = map[name]
+ break
+
+ strain = f"SARS-CoV-2/human/USA/{sample}/2020"
+ fw.write(text.substitute(sample_id=sample_id,
+ sample_name=sample_name,
+ collection_date=collection_date,
+ location=location,
+ locationx=locationx,
+ strain=strain
+ ))
+
+ print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa")