From 43d7264dda8061a024befbc9ca0a89d7159b1e40 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 09:52:32 +0000
Subject: UTHSC upload info

---
 scripts/uthsc_samples/.gitignore       |  1 +
 scripts/uthsc_samples/template.yaml    | 35 +++++++++++++++++++++
 scripts/uthsc_samples/uthsc_samples.py | 57 ++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 scripts/uthsc_samples/.gitignore
 create mode 100644 scripts/uthsc_samples/template.yaml
 create mode 100644 scripts/uthsc_samples/uthsc_samples.py

(limited to 'scripts')

diff --git a/scripts/uthsc_samples/.gitignore b/scripts/uthsc_samples/.gitignore
new file mode 100644
index 0000000..8786e3f
--- /dev/null
+++ b/scripts/uthsc_samples/.gitignore
@@ -0,0 +1 @@
+yaml
diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml
new file mode 100644
index 0000000..1175ac8
--- /dev/null
+++ b/scripts/uthsc_samples/template.yaml
@@ -0,0 +1,35 @@
+id: placeholder
+
+license:
+    license_type: http://creativecommons.org/licenses/by/4.0/
+    title: "$sample_name - $locationx"
+    attribution_name: "Mariah Taylor, Colleen Jonsson"
+    attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
+
+host:
+    host_id: "$sample_id"
+    host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+
+sample:
+    sample_id: "$sample_id"
+    specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
+    collection_date: "$collection_date"
+    collection_location: $location
+
+virus:
+    virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+    virus_strain: "$strain"
+
+technology:
+    sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
+    sequence_assembly_method: https://bio.tools/BWA#!
+    additional_technology_information: Oxford Nanopore MiniIon RNA long reads
+
+submitter:
+    authors: [Mariah Taylor, Colleen Jonsson]
+    submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
+    submitter_address: UTHSC, Memphis, Tennessee 38163, USA
+    originating_lab: Regional Biocontainment Laboratory, Memphis, TN
+    provider_sample_id: $sample_id
+    submitter_sample_id: $sample_id
+    submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162]
diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
new file mode 100644
index 0000000..5c39398
--- /dev/null
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -0,0 +1,57 @@
+import os
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+import re
+
+import sys
+
+# Metadata in tabular format in a spreadsheet(?!)
+xlsx = '../../test/data/10_samples.xlsx'
+
+# Template in a text file
+template_yaml = 'template.yaml'
+
+dir_output = 'yaml'
+
+if not os.path.exists(dir_output):
+    os.makedirs(dir_output)
+
+table = pd.read_excel(xlsx)
+
+print(table)
+
+for index, row in table.iterrows():
+    sample = row['Sample ID']
+    print(f"Processing sample {sample}...")
+
+    with open(template_yaml) as f:
+      text = Template(f.read())
+      with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw:
+          sample_id = sample
+          sample_name = sample
+          collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
+          locationx = row['City']+", "+row['State']+", USA"
+          location = "https://www.wikidata.org/wiki/Q16563" # Memphis by default
+          map = {
+              "Pegram": "https://www.wikidata.org/wiki/Q3289517",
+              "Alexander": "https://www.wikidata.org/wiki/Q79663",
+              "Smithville": "https://www.wikidata.org/wiki/Q2145339",
+              "Nashville": "https://www.wikidata.org/wiki/Q23197",
+              "Madison": "https://www.wikidata.org/wiki/Q494755"
+              }
+
+          for name in map:
+              p = re.compile(name)
+              if p.match(locationx):
+                  location = map[name]
+                  break
+
+          strain = f"SARS-CoV-2/human/USA/{sample}/2020"
+          fw.write(text.substitute(sample_id=sample_id,
+                                   sample_name=sample_name,
+                                   collection_date=collection_date,
+                                   location=location,
+                                   locationx=locationx,
+                                   strain=strain
+                                   ))
-- 
cgit 1.4.1


From d75f1c74fbf86652b02520de6ed46c981cf27e50 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 10:13:05 +0000
Subject: Adding Tennessee items

---
 doc/INSTALL.md                              |  5 ++++
 scripts/db_enrichment/.gitignore            |  1 +
 scripts/db_enrichment/country_enrichment.py | 43 +++++++++++++++++------------
 scripts/db_enrichment/input_location.csv    |  7 +++--
 scripts/db_enrichment/readme.md             |  2 ++
 semantic_enrichment/countries.ttl           | 18 ++++++++++++
 6 files changed, 56 insertions(+), 20 deletions(-)
 create mode 100644 scripts/db_enrichment/.gitignore

(limited to 'scripts')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 96cf1d4..f54c8f2 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -68,6 +68,11 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$  env GUIX_PACKAGE_PATH=~/i
 
 Note: see above on GUIX_PACKAGE_PATH.
 
+## Run country semantic enrichment script
+
+    cd bh20-seq-resource/scripts/db_enrichment
+    guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py
+
 ## Run the tests
 
     guix package -i python-requests python-pandas python-jinja2 python -p ~/opt/python-dev
diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore
new file mode 100644
index 0000000..30b159b
--- /dev/null
+++ b/scripts/db_enrichment/.gitignore
@@ -0,0 +1 @@
+enriched_output.txt
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 8dcf5f2..1f99d42 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -1,3 +1,12 @@
+# This script by @LLTommy queries the main SPARQL endpoint to find which
+# collections are missing country information for GPS coordinates, such as:
+#
+# <http://www.wikidata.org/entity/Q657004> rdfs:label "Canterbury Region" ;
+#    ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+#    ns1:P625 "Point(172.0 -43.6)" .
+#
+# See also the ./readme.md
+
 import requests
 import csv
 from rdflib import Graph, Literal, RDF, URIRef
@@ -30,30 +39,28 @@ def callSPARQL(query):
 
 g = Graph()
 
-
-
 query = """
 construct {
-    ?a wdt:P625 ?c. 
+    ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
-    ?a wdt:P17 ?country.      
-    ?country rdfs:label ?country_label . 
-    ?country wdt:P30 ?continent. 
-    ?continent rdfs:label ?continent_label   
-} WHERE 
-{ 
-    BIND (XXX as ?a) . 
-    ?a wdt:P625 ?c. 
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    ?country wdt:P30 ?continent.
+    ?continent rdfs:label ?continent_label
+} WHERE
+{
+    BIND (XXX as ?a) .
+    ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
-    ?a wdt:P17 ?country.      
-    ?country rdfs:label ?country_label .    
-    ?country wdt:P30 ?continent. 
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    ?country wdt:P30 ?continent.
     ?continent rdfs:label ?continent_label
-    FILTER (lang(?continent_label)='en')           
+    FILTER (lang(?continent_label)='en')
     FILTER (lang(?country_label)='en')
-    FILTER (lang(?label)='en') 
+    FILTER (lang(?label)='en')
 
-}  
+}
 """""
 
 outputFile = 'input_location.csv'
@@ -88,4 +95,4 @@ with open(outputFile, 'r') as csvfile:
             raise
 
 print(g.serialize(format='n3').decode("utf-8"))
-g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
+g.serialize(destination='enriched_output.txt', format='turtle')
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index 364afc8..eb5322a 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,2 +1,5 @@
-http://www.wikidata.org/entity/Q111904
-http://www.wikidata.org/entity/Q1070
\ No newline at end of file
+http://www.wikidata.org/entity/Q3289517
+http://www.wikidata.org/entity/Q79663
+http://www.wikidata.org/entity/Q2145339
+http://www.wikidata.org/entity/Q23197
+http://www.wikidata.org/entity/Q494755
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 55ec496..88e8be5 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -17,5 +17,7 @@ This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countrie
 >FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label}
 >}
 
+[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
+
 - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder
 - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output)
diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl
index 08e9c38..fe50b16 100644
--- a/semantic_enrichment/countries.ttl
+++ b/semantic_enrichment/countries.ttl
@@ -1328,7 +1328,25 @@
     ns1:P17 <http://www.wikidata.org/entity/Q79> ;
     ns1:P625 "Point(31.239444444 30.056111111)" .
 
+<http://www.wikidata.org/entity/Q2145339> rdfs:label "Smithville" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+    ns1:P625 "Point(-85.820833333 35.957222222)" .
+
+<http://www.wikidata.org/entity/Q23197> rdfs:label "Nashville" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+    ns1:P625 "Point(-86.783888888 36.165)" .
+
+<http://www.wikidata.org/entity/Q3289517> rdfs:label "Pegram" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+    ns1:P625 "Point(-87.051666666 36.101666666)" .
+
+<http://www.wikidata.org/entity/Q494755> rdfs:label "Madison County" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+    ns1:P625 "Point(-88.84 35.61)" .
 
+<http://www.wikidata.org/entity/Q79663> rdfs:label "Alexander City" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q30> ;
+    ns1:P625 "Point(-85.936008 32.933157)" .
 
 <http://www.wikidata.org/entity/Q538> rdfs:label "Oceania" .
 <http://www.wikidata.org/entity/Q49> rdfs:label "North America" .
-- 
cgit 1.4.1


From 039ad90b1627464b687adddb08cf489dca3c5fbc Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 10:14:21 +0000
Subject: Using correct wikidata geo link

---
 scripts/uthsc_samples/uthsc_samples.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'scripts')

diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
index 5c39398..c18c07a 100644
--- a/scripts/uthsc_samples/uthsc_samples.py
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -32,13 +32,13 @@ for index, row in table.iterrows():
           sample_name = sample
           collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
           locationx = row['City']+", "+row['State']+", USA"
-          location = "https://www.wikidata.org/wiki/Q16563" # Memphis by default
+          location = "http://www.wikidata.org/enitity/Q16563" # Memphis by default
           map = {
-              "Pegram": "https://www.wikidata.org/wiki/Q3289517",
-              "Alexander": "https://www.wikidata.org/wiki/Q79663",
-              "Smithville": "https://www.wikidata.org/wiki/Q2145339",
-              "Nashville": "https://www.wikidata.org/wiki/Q23197",
-              "Madison": "https://www.wikidata.org/wiki/Q494755"
+              "Pegram": "http://www.wikidata.org/enitity/Q3289517",
+              "Alexander": "http://www.wikidata.org/enitity/Q79663",
+              "Smithville": "http://www.wikidata.org/enitity/Q2145339",
+              "Nashville": "http://www.wikidata.org/enitity/Q23197",
+              "Madison": "http://www.wikidata.org/enitity/Q494755"
               }
 
           for name in map:
-- 
cgit 1.4.1


From 951ebe949d88cdbfed028e0a2a420ce7921c3919 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 10:31:56 +0000
Subject: Countries

---
 scripts/db_enrichment/input_location.csv | 21 ++++++---
 scripts/uthsc_samples/uthsc_samples.py   | 12 ++---
 semantic_enrichment/countries.ttl        | 75 ++++++++++++++++++++++++--------
 3 files changed, 80 insertions(+), 28 deletions(-)

(limited to 'scripts')

diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index eb5322a..a4246cd 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,5 +1,16 @@
-http://www.wikidata.org/entity/Q3289517
-http://www.wikidata.org/entity/Q79663
-http://www.wikidata.org/entity/Q2145339
-http://www.wikidata.org/entity/Q23197
-http://www.wikidata.org/entity/Q494755
+http://www.wikidata.org/entity/Q7960498
+http://www.wikidata.org/entity/Q692895
+http://www.wikidata.org/entity/Q928
+http://www.wikidata.org/entity/Q2722074
+http://www.wikidata.org/entity/Q25622187
+http://www.wikidata.org/entity/Q27684996
+http://www.wikidata.org/entity/Q2757125
+http://www.wikidata.org/entity/Q1922283
+http://www.wikidata.org/entity/Q490
+http://www.wikidata.org/entity/Q677037
+http://www.wikidata.org/entity/Q3037
+http://www.wikidata.org/entity/Q843
+http://www.wikidata.org/entity/Q183
+http://www.wikidata.org/entity/Q29
+http://www.wikidata.org/entity/Q17
+http://www.wikidata.org/entity/Q810
diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
index c18c07a..3ad2561 100644
--- a/scripts/uthsc_samples/uthsc_samples.py
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -32,13 +32,13 @@ for index, row in table.iterrows():
           sample_name = sample
           collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
           locationx = row['City']+", "+row['State']+", USA"
-          location = "http://www.wikidata.org/enitity/Q16563" # Memphis by default
+          location = "http://www.wikidata.org/entity/Q16563" # Memphis by default
           map = {
-              "Pegram": "http://www.wikidata.org/enitity/Q3289517",
-              "Alexander": "http://www.wikidata.org/enitity/Q79663",
-              "Smithville": "http://www.wikidata.org/enitity/Q2145339",
-              "Nashville": "http://www.wikidata.org/enitity/Q23197",
-              "Madison": "http://www.wikidata.org/enitity/Q494755"
+              "Pegram": "http://www.wikidata.org/entity/Q3289517",
+              "Alexander": "http://www.wikidata.org/entity/Q79663",
+              "Smithville": "http://www.wikidata.org/entity/Q2145339",
+              "Nashville": "http://www.wikidata.org/entity/Q23197",
+              "Madison": "http://www.wikidata.org/entity/Q494755"
               }
 
           for name in map:
diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl
index fe50b16..728877f 100644
--- a/semantic_enrichment/countries.ttl
+++ b/semantic_enrichment/countries.ttl
@@ -220,7 +220,6 @@
     ns1:P17 <http://www.wikidata.org/entity/Q30> ;
     ns1:P625 "Point(-120.0 37.0)" .
 
-
 <http://www.wikidata.org/entity/Q155> rdfs:label "Brazil" ;
     ns1:P17 <http://www.wikidata.org/entity/Q155> ;
     ns1:P30 <http://www.wikidata.org/entity/Q18> ;
@@ -1157,7 +1156,9 @@
     ns1:P625 "Point(28.0 -14.0)" .
 
 <http://www.wikidata.org/entity/Q96> rdfs:label "Mexico" ;
-    ns1:P30 <http://www.wikidata.org/entity/Q49> .
+    ns1:P17 <http://www.wikidata.org/entity/Q96> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q49> ;
+    ns1:P625 "Point(-102.0 23.0)" .
 
 <http://www.wikidata.org/entity/Q1028> rdfs:label "Morocco" ;
     ns1:P17 <http://www.wikidata.org/entity/Q1028> ;
@@ -1170,32 +1171,17 @@
         <http://www.wikidata.org/entity/Q48> ;
     ns1:P625 "Point(94.25 66.416666666)" .
 
-<http://www.wikidata.org/entity/Q183> rdfs:label "Germany" ;
-    ns1:P30 <http://www.wikidata.org/entity/Q46> .
-
-<http://www.wikidata.org/entity/Q29> rdfs:label "Spain" ;
-    ns1:P30 <http://www.wikidata.org/entity/Q46> .
-
-<http://www.wikidata.org/entity/Q38> rdfs:label "Italy" ;
-    ns1:P30 <http://www.wikidata.org/entity/Q46> .
-
 <http://www.wikidata.org/entity/Q55> rdfs:label "Netherlands" ;
     ns1:P17 <http://www.wikidata.org/entity/Q29999> ;
     ns1:P30 <http://www.wikidata.org/entity/Q27611>,
         <http://www.wikidata.org/entity/Q46> ;
     ns1:P625 "Point(5.55 52.316666666)" .
 
-<http://www.wikidata.org/entity/Q810> rdfs:label "Jordan" ;
-    ns1:P30 <http://www.wikidata.org/entity/Q48> .
-
 <http://www.wikidata.org/entity/Q1044> rdfs:label "Sierra Leone" ;
     ns1:P17 <http://www.wikidata.org/entity/Q1044> ;
     ns1:P30 <http://www.wikidata.org/entity/Q15> ;
     ns1:P625 "Point(-11.916667 8.5)" .
 
-<http://www.wikidata.org/entity/Q17> rdfs:label "Japan" ;
-    ns1:P30 <http://www.wikidata.org/entity/Q48> .
-
 <http://www.wikidata.org/entity/Q948> rdfs:label "Tunisia" ;
     ns1:P17 <http://www.wikidata.org/entity/Q948> ;
     ns1:P30 <http://www.wikidata.org/entity/Q15> ;
@@ -1348,6 +1334,61 @@
     ns1:P17 <http://www.wikidata.org/entity/Q30> ;
     ns1:P625 "Point(-85.936008 32.933157)" .
 
+<http://www.wikidata.org/entity/Q1922283> rdfs:label "Mehsana" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P625 "Point(72.4 23.6)" .
+
+<http://www.wikidata.org/entity/Q3037> rdfs:label "Kathmandu" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q837> ;
+    ns1:P625 "Point(85.366666666 27.716666666)" .
+
+<http://www.wikidata.org/entity/Q490> rdfs:label "Milan" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q38> ;
+    ns1:P625 "Point(9.19 45.466944444)" .
+
+<http://www.wikidata.org/entity/Q677037> rdfs:label "Telangana" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P625 "Point(79.59 17.99)" .
+
+<http://www.wikidata.org/entity/Q928> rdfs:label "Philippines" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q928> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
+    ns1:P625 "Point(123.0 12.0)" .
+
+<http://www.wikidata.org/entity/Q1922283> rdfs:label "Mehsana" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P625 "Point(72.4 23.6)" .
+
+<http://www.wikidata.org/entity/Q17> rdfs:label "Japan" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q17> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
+    ns1:P625 "Point(136.0 35.0)" .
+
+<http://www.wikidata.org/entity/Q183> rdfs:label "Germany" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q183> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q46> ;
+    ns1:P625 "Point(10.0 51.0)" .
+
+<http://www.wikidata.org/entity/Q29> rdfs:label "Spain" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q29> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q46> ;
+    ns1:P625 "Point(-3.5 40.2)" .
+
+<http://www.wikidata.org/entity/Q810> rdfs:label "Jordan" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q810> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
+    ns1:P625 "Point(36.5 31.2)" .
+
+<http://www.wikidata.org/entity/Q843> rdfs:label "Pakistan" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q843> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
+    ns1:P625 "Point(71.0 30.0)" .
+
+<http://www.wikidata.org/entity/Q928> rdfs:label "Philippines" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q928> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
+    ns1:P625 "Point(123.0 12.0)" .
+
 <http://www.wikidata.org/entity/Q538> rdfs:label "Oceania" .
 <http://www.wikidata.org/entity/Q49> rdfs:label "North America" .
 <http://www.wikidata.org/entity/Q18> rdfs:label "South America" .
-- 
cgit 1.4.1


From 7c74a20b90ca647ca387eff2ed830c22f5ba1282 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 12:48:00 +0000
Subject: Country trouble shooting

---
 doc/INSTALL.md                              |  1 +
 scripts/db_enrichment/country_enrichment.py | 29 ++++++++++++++++++++++++++---
 scripts/db_enrichment/input_location.csv    | 10 ----------
 scripts/db_enrichment/readme.md             | 12 +++++++-----
 4 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'scripts')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 45aca0f..367b452 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -77,6 +77,7 @@ Note: see above on GUIX_PACKAGE_PATH.
 ## Run country semantic enrichment script
 
     cd bh20-seq-resource/scripts/db_enrichment
+    edit input_location.csv
     guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py
 
 ## Run the tests
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 1f99d42..f62a64e 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -39,14 +39,36 @@ def callSPARQL(query):
 
 g = Graph()
 
+test_query="""
+# Use with https://query.wikidata.org/
+SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE {
+    BIND (XXX as ?a) .
+    OPTIONAL {
+        ?a wdt:P625 ?coor.
+    }
+    ?a rdfs:label ?label .
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    OPTIONAL {
+        ?country wdt:P30 ?continent.
+        ?continent rdfs:label ?continent_label
+        FILTER (lang(?continent_label)='en')
+    }
+    FILTER (lang(?country_label)='en')
+    FILTER (lang(?label)='en')
+}
+"""
+
+# wdt:P625 is the Wikidata "coordinate location" property (GPS coordinates)
+
 query = """
 construct {
     ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
     ?a wdt:P17 ?country.
     ?country rdfs:label ?country_label .
-    ?country wdt:P30 ?continent.
-    ?continent rdfs:label ?continent_label
+    ?country wdt:P30 ?continent .
+    ?continent rdfs:label ?continent_label .
 } WHERE
 {
     BIND (XXX as ?a) .
@@ -59,7 +81,6 @@ construct {
     FILTER (lang(?continent_label)='en')
     FILTER (lang(?country_label)='en')
     FILTER (lang(?label)='en')
-
 }
 """""
 
@@ -72,6 +93,8 @@ with open(outputFile, 'r') as csvfile:
         counter=counter+1
 
         try:
+            testq = test_query.replace("XXX", "<"+row[0]+">")
+            print(testq)
             tmpquery=query.replace("XXX", "<"+row[0]+">")
             print(tmpquery)
 
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index a4246cd..8c3308f 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,16 +1,6 @@
 http://www.wikidata.org/entity/Q7960498
 http://www.wikidata.org/entity/Q692895
-http://www.wikidata.org/entity/Q928
 http://www.wikidata.org/entity/Q2722074
 http://www.wikidata.org/entity/Q25622187
 http://www.wikidata.org/entity/Q27684996
 http://www.wikidata.org/entity/Q2757125
-http://www.wikidata.org/entity/Q1922283
-http://www.wikidata.org/entity/Q490
-http://www.wikidata.org/entity/Q677037
-http://www.wikidata.org/entity/Q3037
-http://www.wikidata.org/entity/Q843
-http://www.wikidata.org/entity/Q183
-http://www.wikidata.org/entity/Q29
-http://www.wikidata.org/entity/Q17
-http://www.wikidata.org/entity/Q810
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 88e8be5..7539104 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -11,11 +11,13 @@ File containing information about the countries in our database. Additional info
 This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet:
 
 
->SELECT DISTINCT ?geoLocation  WHERE
->{
->?fasta ?x [ <<http://purl.obolibrary.org/obo/GAZ_00000448>> ?geoLocation] .
->FILTER NOT EXISTS {?geoLocation <<http://www.w3.org/2000/01/rdf-schema#label>> ?geoLocation_tmp_label}
->}
+```sparql
+SELECT DISTINCT ?geoLocation  WHERE
+{
+    ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
+    FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
+}
+```
 
 [Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
 
-- 
cgit 1.4.1


From 6c654dd60f98d473ba94fda6143d8b8b00f99586 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 12:58:38 +0000
Subject: Add country entries that miss coordinates

---
 scripts/db_enrichment/country_enrichment.py |  2 +-
 semantic_enrichment/countries.ttl           | 30 ++++++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'scripts')

diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index f62a64e..37329fb 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -72,7 +72,6 @@ construct {
 } WHERE
 {
     BIND (XXX as ?a) .
-    ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
     ?a wdt:P17 ?country.
     ?country rdfs:label ?country_label .
@@ -81,6 +80,7 @@ construct {
     FILTER (lang(?continent_label)='en')
     FILTER (lang(?country_label)='en')
     FILTER (lang(?label)='en')
+    OPTIONAL { ?a wdt:P625 ?c }
 }
 """""
 
diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl
index 728877f..b0651cf 100644
--- a/semantic_enrichment/countries.ttl
+++ b/semantic_enrichment/countries.ttl
@@ -348,11 +348,6 @@
     ns1:P30 <http://www.wikidata.org/entity/Q538> ;
     ns1:P625 "Point(137.0 -28.0)" .
 
-<http://www.wikidata.org/entity/Q668> rdfs:label "India" ;
-    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
-    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
-    ns1:P625 "Point(83.0 22.8)" .
-
 <http://www.wikidata.org/entity/Q739> rdfs:label "Colombia" ;
     ns1:P17 <http://www.wikidata.org/entity/Q739> ;
     ns1:P30 <http://www.wikidata.org/entity/Q18> ;
@@ -1389,6 +1384,31 @@
     ns1:P30 <http://www.wikidata.org/entity/Q48> ;
     ns1:P625 "Point(123.0 12.0)" .
 
+<http://www.wikidata.org/entity/Q25622187> rdfs:label "Bayad" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P625 "Point(73.0 20.8)" .
+
+<http://www.wikidata.org/entity/Q2757125> rdfs:label "Choryasi Taluka" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P625 "Point(73.0 20.8)" .
+
+<http://www.wikidata.org/entity/Q27684996> rdfs:label "Daskroi" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P625 "Point(72.0 22.0)" .
+
+<http://www.wikidata.org/entity/Q692895> rdfs:label "Wairarapa" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+    ns1:P625 "Point(174.0 -41.2)" .
+
+<http://www.wikidata.org/entity/Q7960498> rdfs:label "Waitemata City" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q664> ;
+    ns1:P625 "Point(174.0 -41.2)" .
+
+<http://www.wikidata.org/entity/Q668> rdfs:label "India" ;
+    ns1:P17 <http://www.wikidata.org/entity/Q668> ;
+    ns1:P30 <http://www.wikidata.org/entity/Q48> ;
+    ns1:P625 "Point(83.0 22.8)" .
+
 <http://www.wikidata.org/entity/Q538> rdfs:label "Oceania" .
 <http://www.wikidata.org/entity/Q49> rdfs:label "North America" .
 <http://www.wikidata.org/entity/Q18> rdfs:label "South America" .
-- 
cgit 1.4.1


From cb2200839e180d518167bce06395ae04c332ddf4 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sun, 8 Nov 2020 10:35:03 +0000
Subject: UTHSC samples

---
 example/uthsc_example.yaml          |  4 ++--
 scripts/uthsc_samples/template.yaml | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'scripts')

diff --git a/example/uthsc_example.yaml b/example/uthsc_example.yaml
index 18188dc..956faf1 100644
--- a/example/uthsc_example.yaml
+++ b/example/uthsc_example.yaml
@@ -1,10 +1,10 @@
 id: placeholder
 
 license:
-    license_type: http://creativecommons.org/licenses/by/4.0/
+    license_type: https://creativecommons.org/licenses/by/4.0/
     title: "Sample"
     attribution_name: "Mariah Taylor, Colleen Jonsson"
-    attribution_url: https://www.uthsc.edu/rbl/
+    attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
 
 host:
     host_id: TN_UT2
diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml
index 1175ac8..07e0828 100644
--- a/scripts/uthsc_samples/template.yaml
+++ b/scripts/uthsc_samples/template.yaml
@@ -1,9 +1,9 @@
 id: placeholder
 
 license:
-    license_type: http://creativecommons.org/licenses/by/4.0/
-    title: "$sample_name - $locationx"
-    attribution_name: "Mariah Taylor, Colleen Jonsson"
+    license_type: https://creativecommons.org/licenses/by/4.0/
+    title: "$strain"
+    attribution_name: "Mariah Taylor, Colleen B. Jonsson"
     attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
 
 host:
@@ -23,10 +23,10 @@ virus:
 technology:
     sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
     sequence_assembly_method: https://bio.tools/BWA#!
-    additional_technology_information: Oxford Nanopore MiniIon RNA long reads
+    additional_technology_information: "Oxford Nanopore MiniIon RNA long reads"
 
 submitter:
-    authors: [Mariah Taylor, Colleen Jonsson]
+    authors: [Mariah Taylor, Colleen B. Jonsson]
     submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
     submitter_address: UTHSC, Memphis, Tennessee 38163, USA
     originating_lab: Regional Biocontainment Laboratory, Memphis, TN
-- 
cgit 1.4.1


From b311e2ec0f1d02cf16152855dd8bdd760ed4578b Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sun, 8 Nov 2020 10:50:24 +0000
Subject: Uploaded UTHSC sequences

---
 scripts/uthsc_samples/uthsc_samples.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'scripts')

diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
index 3ad2561..54c70ee 100644
--- a/scripts/uthsc_samples/uthsc_samples.py
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -55,3 +55,5 @@ for index, row in table.iterrows():
                                    locationx=locationx,
                                    strain=strain
                                    ))
+
+          print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa")
-- 
cgit 1.4.1


From 11812bbf95ddf1771a159b7ef6580a9179c0cad1 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 10 Nov 2020 05:02:57 -0600
Subject: virtuoso: Added a --no-cache option

---
 scripts/update_virtuoso/check_for_updates.py | 34 ++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 9 deletions(-)

(limited to 'scripts')

diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py
index 8761c8a..a63f4d1 100755
--- a/scripts/update_virtuoso/check_for_updates.py
+++ b/scripts/update_virtuoso/check_for_updates.py
@@ -4,7 +4,7 @@
 #
 # You can run this in a Guix container with
 #
-#    ~/opt/guix/bin/guix environment -C guix --ad-hoc python python-requests curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba
+#    ~/opt/guix/bin/guix environment -C guix --ad-hoc raptor2 python python-requests curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba
 #
 # Note you'll need to run from the root dir. Remove the ./cache.txt file if you want to force an update.
 #
@@ -19,6 +19,11 @@ fn = sys.argv[1]
 user = sys.argv[2]
 pwd = sys.argv[3]
 
+no_cache = False
+if fn == "--no-cache":
+  no_cache = True
+  print("Skipping cache check and download of metadata.ttl")
+
 scriptdir = os.path.dirname(os.path.realpath(__file__))
 print(scriptdir)
 basedir = os.path.dirname(os.path.dirname(scriptdir))
@@ -29,6 +34,15 @@ def upload(fn):
     # print("DELETE "+fn)
     # cmd = ("curl --digest --user dba:%s --verbose --url -G http://sparql.genenetwork.org/sparql-graph-crud-auth --data-urlencode graph=http://covid-19.genenetwork.org/graph -X DELETE" % pwd).split(" ")
 
+    print("VALIDATE "+fn)
+    cmd = f"rapper -i turtle {fn}"
+    print(cmd)
+    p = subprocess.Popen(cmd.split(" "),stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    out, err = p.communicate()
+    if p.returncode != 0:
+      print(out,err)
+    assert(p.returncode == 0)
+
     print("UPLOADING "+fn)
     cmd = ("curl -X PUT --digest -u dba:%s -H Content-Type:text/turtle -T %s -G http://sparql.genenetwork.org/sparql-graph-crud-auth --data-urlencode graph=http://covid-19.genenetwork.org/graph/%s" % (pwd, fn, os.path.basename(fn)) )
     print(cmd)
@@ -39,6 +53,7 @@ def upload(fn):
 
 url = 'https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl'
 # --- Fetch headers from TTL file on Arvados
+#  curl --head https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl
 r = requests.head(url)
 print(r.headers)
 print(r.headers['Last-Modified'])
@@ -49,14 +64,14 @@ last_modified_str = r.headers['Last-Modified']
 t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" )
 print(t_stamp)
 
-# OK, it works, now check last stored value
+# OK, it works, now check last stored value in the cache
 stamp = None
 if os.path.isfile(fn):
     file = open(fn,"r")
     stamp = file.read()
     file.close
 
-if stamp != last_modified_str:
+if no_cache or stamp != last_modified_str:
     print("Delete graphs")
     for graph in ["labels.ttl", "metadata.ttl", "countries.ttl"]:
         cmd = ("curl --digest -u dba:%s --verbose --url http://127.0.0.1:8890/sparql-graph-crud-auth?graph=http://covid-19.genenetwork.org/graph/%s -X DELETE" % (pwd, graph))
@@ -69,12 +84,13 @@ if stamp != last_modified_str:
     upload(basedir+"/semantic_enrichment/labels.ttl")
     upload(basedir+"/semantic_enrichment/countries.ttl")
 
-    print("Fetch metadata TTL")
-    r = requests.get(url)
-    assert(r.status_code == 200)
-    with open("metadata.ttl", "w") as f:
-        f.write(r.text)
-        f.close
+    if not no_cache:
+        print("Fetch metadata TTL")
+        r = requests.get(url)
+        assert(r.status_code == 200)
+        with open("metadata.ttl", "w") as f:
+            f.write(r.text)
+            f.close
     upload("metadata.ttl")
     with open(fn,"w") as f:
         f.write(last_modified_str)
-- 
cgit 1.4.1


From 986bacf77191a159d842605f6bb86f3f92a3be54 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 10 Nov 2020 11:41:09 +0000
Subject: virtuoso: header change

---
 scripts/update_virtuoso/check_for_updates.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'scripts')

diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py
index a63f4d1..fb66c2e 100755
--- a/scripts/update_virtuoso/check_for_updates.py
+++ b/scripts/update_virtuoso/check_for_updates.py
@@ -56,13 +56,14 @@ url = 'https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mer
 #  curl --head https://download.lugli.arvadosapi.com/c=lugli-4zz18-z513nlpqm03hpca/_/mergedmetadata.ttl
 r = requests.head(url)
 print(r.headers)
-print(r.headers['Last-Modified'])
+if not no_cache:
+  print(r.headers['Last-Modified'])
 
-# --- Convert/validate time stamp
-# ValueError: time data 'Tue, 21 Apr 2020 23:47:43 GMT' does not match format '%a %b %d %H:%M:%S %Y'
-last_modified_str = r.headers['Last-Modified']
-t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" )
-print(t_stamp)
+  # --- Convert/validate time stamp
+  # ValueError: time data 'Tue, 21 Apr 2020 23:47:43 GMT' does not match format '%a %b %d %H:%M:%S %Y'
+  last_modified_str = r.headers['Last-Modified']
+  t_stamp = time.strptime(last_modified_str,"%a, %d %b %Y %H:%M:%S %Z" )
+  print(t_stamp)
 
 # OK, it works, now check last stored value in the cache
 stamp = None
-- 
cgit 1.4.1


From b3eb10770bada631c929fec83247f6fda7ef22a4 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 10 Nov 2020 06:39:10 -0600
Subject: virtuoso: no-cache

---
 scripts/update_virtuoso/check_for_updates.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'scripts')

diff --git a/scripts/update_virtuoso/check_for_updates.py b/scripts/update_virtuoso/check_for_updates.py
index fb66c2e..939a575 100755
--- a/scripts/update_virtuoso/check_for_updates.py
+++ b/scripts/update_virtuoso/check_for_updates.py
@@ -93,7 +93,8 @@ if no_cache or stamp != last_modified_str:
             f.write(r.text)
             f.close
     upload("metadata.ttl")
-    with open(fn,"w") as f:
-        f.write(last_modified_str)
+    if not no_cache:
+        with open(fn,"w") as f:
+            f.write(last_modified_str)
 else:
     print("Metadata is up to date")
-- 
cgit 1.4.1