From 4ec57900569c360151e8fd36649a035fba0a9869 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Wed, 4 Nov 2020 15:29:45 +0000
Subject: virtuoso-ose: install and settings

---
 doc/INSTALL.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'doc/INSTALL.md')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index df825c6..0180a4b 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -67,3 +67,26 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/i
 ```
 
 Note: see above on GUIX_PACKAGE_PATH.
+
+
+## Run Virtuoso-ose
+
+Guix has a package for virtuoso-ose that we use:
+
+    guix package -i virtuoso-ose -p ~/opt/virtuoso
+
+Create a data dir:
+
+    mkdir -p /export/virtuoso/var/lib/virtuoso/db
+    chown $USER /export/virtuoso/var/lib/virtuoso/db
+
+Add an ini file:
+
+    cp ~/opt/virtuoso/var/lib/virtuoso/db/virtuoso.ini .config/
+
+And run from the data dir:
+
+    cd /export/virtuoso/var/lib/virtuoso/db
+    guix environment --ad-hoc virtuoso-ose -- virtuoso-t -f
+
+Visit http://localhost:8890/sparql
--
cgit v1.2.3

From fbbec51e604964d18ab72cbf0ac24b102ecc0376 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 07:45:10 +0000
Subject: Working on upload

---
 doc/INSTALL.md                            |   5 +
 doc/blog/using-covid-19-pubseq-part3.html | 261 +++++++++++++++++++-----------
 doc/blog/using-covid-19-pubseq-part3.org  | 161 +++++++++++-------
 3 files changed, 277 insertions(+), 150 deletions(-)

(limited to 'doc/INSTALL.md')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 0180a4b..96cf1d4 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -68,6 +68,11 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/i
 
 Note: see above on GUIX_PACKAGE_PATH.
 
+## Run the tests
+
+    guix package -i python-requests python-pandas python-jinja2 python -p ~/opt/python-dev
+    . ~/opt/python-dev/etc/profile
+
 
 ## Run Virtuoso-ose

diff --git a/doc/blog/using-covid-19-pubseq-part3.html b/doc/blog/using-covid-19-pubseq-part3.html
index 788c1d2..b49830b 100644

[Diff of the generated HTML page omitted: using-covid-19-pubseq-part3.html is built from the .org source, and the same changes appear in readable form in the .org diff below.]
diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org
index fb68251..f3ba073 100644
--- a/doc/blog/using-covid-19-pubseq-part3.org
+++ b/doc/blog/using-covid-19-pubseq-part3.org
@@ -7,10 +7,19 @@
 #+HTML_HEAD:
 #+OPTIONS: ^:nil
 
+* Introduction
+
+In this document we explain how to upload data into COVID-19 PubSeq.
+This can happen through a web page, or through a command line
+script. We'll also show how to parametrize uploads by using templates.
+The procedure is much easier than with other repositories and can be
+fully automated. Once uploaded you can use our export API to prepare
+for other repositories.
 
 * Table of Contents                                        :TOC:noexport:
 
-  - [[#uploading-data][Uploading Data]]
+  - [[#introduction][Introduction]]
+  - [[#uploading-data][Uploading data]]
   - [[#step-1-upload-sequence][Step 1: Upload sequence]]
   - [[#step-2-add-metadata][Step 2: Add metadata]]
   - [[#obligatory-fields][Obligatory fields]]
@@ -23,7 +32,7 @@
   - [[#example-uploading-bulk-genbank-sequences][Example: uploading bulk GenBank sequences]]
   - [[#example-preparing-metadata][Example: preparing metadata]]
 
-* Uploading Data
+* Uploading data
 
 The COVID-19 PubSeq allows you to upload your SARS-Cov-2 strains to a
 public resource for global comparisons. A recompute of the pangenome
@@ -165,55 +174,90 @@ file an associated metadata in [[https://github.com/arvados/bh20-seq-resource/bl
 the web form and gets validated from the same [[https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml][schema]] looks.
 The YAML that you need to create/generate for your samples looks like
 
+A minimal example of metadata looks like
+
+#+begin_src json
+  id: placeholder
+
+  license:
+      license_type: http://creativecommons.org/licenses/by/4.0/
+
+  host:
+      host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+
+  sample:
+      sample_id: XX
+      collection_date: "2020-01-01"
+      collection_location: http://www.wikidata.org/entity/Q148
+
+  virus:
+      virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+
+  technology:
+      sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
+
+  submitter:
+      authors: [John Doe]
+#+end_src
+
+A more elaborate example (note that most fields are optional) may look like
+
 #+begin_src json
-id: placeholder
-
-host:
-    host_id: XX1
-    host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
-    host_sex: http://purl.obolibrary.org/obo/PATO_0000384
-    host_age: 20
-    host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
-    host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
-    host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
-    host_vaccination: [vaccines1,vaccine2]
-    ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010
-    additional_host_information: Optional free text field for additional information
-
-sample:
-    sample_id: Id of the sample as defined by the submitter
-    collector_name: Name of the person that took the sample
-    collecting_institution: Institute that was responsible of sampling
-    specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
-    collection_date: "2020-01-01"
-    collection_location: http://www.wikidata.org/entity/Q148
-    sample_storage_conditions: frozen specimen
-    source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
-    additional_collection_information: Optional free text field for additional information
-
-virus:
-    virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
-    virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
-
-technology:
-    sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
-    sequence_assembly_method: Protocol used for assembly
-    sequencing_coverage: [70.0, 100.0]
-    additional_technology_information: Optional free text field for additional information
-
-submitter:
-    authors: [John Doe, Joe Boe, Jonny Oe]
-    submitter_name: [John Doe]
-    submitter_address: John Doe's address
-    originating_lab: John Doe kitchen
-    lab_address: John Doe's address
-    provider_sample_id: XXX1
-    submitter_sample_id: XXX2
-    publication: PMID00001113
-    submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
-    additional_submitter_information: Optional free text field for additional information
+  id: placeholder
+
+  host:
+      host_id: XX1
+      host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+      host_sex: http://purl.obolibrary.org/obo/PATO_0000384
+      host_age: 20
+      host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
+      host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
+      host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
+      host_vaccination: [vaccines1,vaccine2]
+      ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010
+      additional_host_information: Optional free text field for additional information
+
+  sample:
+      sample_id: Id of the sample as defined by the submitter
+      collector_name: Name of the person that took the sample
+      collecting_institution: Institute that was responsible of sampling
+      specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
+      collection_date: "2020-01-01"
+      collection_location: http://www.wikidata.org/entity/Q148
+      sample_storage_conditions: frozen specimen
+      source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
+      additional_collection_information: Optional free text field for additional information
+
+  virus:
+      virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+      virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
+
+  technology:
+      sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
+      sequence_assembly_method: Protocol used for assembly
+      sequencing_coverage: [70.0, 100.0]
+      additional_technology_information: Optional free text field for additional information
+
+  submitter:
+      authors: [John Doe, Joe Boe, Jonny Oe]
+      submitter_name: [John Doe]
+      submitter_address: John Doe's address
+      originating_lab: John Doe kitchen
+      lab_address: John Doe's address
+      provider_sample_id: XXX1
+      submitter_sample_id: XXX2
+      publication: PMID00001113
+      submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
+      additional_submitter_information: Optional free text field for additional information
 #+end_src
 
+More metadata is yummy when stored in RDF. [[https://yummydata.org/][Yummydata]] is useful to a wider community. Note
+that many of the terms in the above example are URIs, such as
+host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606. We use
+web ontologies for these to make the data less ambiguous and more
+FAIR. Check out the optional fields as defined in the schema. If a term is not listed,
+a little bit of web searching may be required, or [[./contact][contact]] us.
+
 ** Run the uploader (CLI)
 
 Installing with pip you should be
@@ -221,7 +265,6 @@ able to run
 
 : bh20sequploader sequence.fasta metadata.yaml
 
-
 Alternatively the script can be installed from
 [[https://github.com/arvados/bh20-seq-resource#installation][github]]. Run on the
 command line
@@ -274,13 +317,23 @@ done
 
 ** Example: preparing metadata
 
-Usually, metadata are available in tabular format, like spreadsheets. As an example, we provide a script
-[[https://github.com/arvados/bh20-seq-resource/tree/master/scripts/esr_samples][esr_samples.py]] to show you how to parse
-your metadata in YAML files ready for the upload. To execute the script, go in the ~bh20-seq-resource/scripts/esr_samples
-and execute
+Usually, metadata are available in a tabular format, such as
+spreadsheets. As an example, we provide a script [[https://github.com/arvados/bh20-seq-resource/tree/master/scripts/esr_samples][esr_samples.py]] to
+show you how to parse your metadata into YAML files ready for the
+upload. To execute the script, go into the
+~bh20-seq-resource/scripts/esr_samples directory and execute
 
 #+BEGIN_SRC sh
 python3 esr_samples.py
 #+END_SRC
 
-You will find the YAML files in the `yaml` folder which will be created in the same directory.
+You will find the YAML files in the `yaml` folder which will be
+created in the same directory.
+
+In the example we use Python pandas to read the spreadsheet into a
+tabular structure. Next we use a [[https://github.com/arvados/bh20-seq-resource/blob/master/scripts/esr_samples/template.yaml][template.yaml]] file that gets filled
+in by ~esr_samples.py~ so we get a metadata YAML file for each sample.
+
+Next run the earlier CLI uploader for each YAML and FASTA combination.
+It can't be much easier than this. For ESR we uploaded a batch of 600
+sequences this way writing a few lines of Python [[https://github.com/arvados/bh20-seq-resource/blob/master/scripts/esr_samples/esr_samples.py][code]]. See [[http://covid19.genenetwork.org/resource/20VR0995][example]].
--
cgit v1.2.3

From d75f1c74fbf86652b02520de6ed46c981cf27e50 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 10:13:05 +0000
Subject: Adding Tennessee items

---
 doc/INSTALL.md                              |  5 ++++
 scripts/db_enrichment/.gitignore            |  1 +
 scripts/db_enrichment/country_enrichment.py | 43 +++++++++++++++++------------
 scripts/db_enrichment/input_location.csv    |  7 +++--
 scripts/db_enrichment/readme.md             |  2 ++
 semantic_enrichment/countries.ttl           | 18 ++++++++++++
 6 files changed, 56 insertions(+), 20 deletions(-)
 create mode 100644 scripts/db_enrichment/.gitignore

(limited to 'doc/INSTALL.md')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 96cf1d4..f54c8f2 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -68,6 +68,11 @@ penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/i
 
 Note: see above on GUIX_PACKAGE_PATH.
 
+## Run country semantic enrichment script
+
+    cd bh20-seq-resource/scripts/db_enrichment
+    guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py
+
 ## Run the tests
 
     guix package -i python-requests python-pandas python-jinja2 python -p ~/opt/python-dev
diff --git a/scripts/db_enrichment/.gitignore b/scripts/db_enrichment/.gitignore
new file mode 100644
index 0000000..30b159b
--- /dev/null
+++ b/scripts/db_enrichment/.gitignore
@@ -0,0 +1 @@
+enriched_output.txt
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 8dcf5f2..1f99d42 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -1,3 +1,12 @@
+# This script by @LLTommy queries the main SPARQL end point to find what
+# collections are missing country information for GPS coordinates, such as
+#
+#    rdfs:label "Canterbury Region" ;
+#    ns1:P17  ;
+#    ns1:P625 "Point(172.0 -43.6)" .
+#
+# See also the ./readme.md
+
 import requests
 import csv
 from rdflib import Graph, Literal, RDF, URIRef
@@ -30,30 +39,28 @@ def callSPARQL(query):
 
 g = Graph()
 
-
-
 query = """
 construct {
-    ?a wdt:P625 ?c.
+    ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
-    ?a wdt:P17 ?country.
-    ?country rdfs:label ?country_label .
-    ?country wdt:P30 ?continent.
-    ?continent rdfs:label ?continent_label
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    ?country wdt:P30 ?continent.
+    ?continent rdfs:label ?continent_label
 } WHERE
 {
-    BIND (XXX as ?a) .
-    ?a wdt:P625 ?c.
+    BIND (XXX as ?a) .
+    ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
-    ?a wdt:P17 ?country.
-    ?country rdfs:label ?country_label .
-    ?country wdt:P30 ?continent.
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    ?country wdt:P30 ?continent.
+    ?continent rdfs:label ?continent_label
 
-    FILTER (lang(?continent_label)='en')
+    FILTER (lang(?continent_label)='en')
     FILTER (lang(?country_label)='en')
-    FILTER (lang(?label)='en')
+    FILTER (lang(?label)='en')
 
-}
+}
 """""
 
 outputFile = 'input_location.csv'
@@ -88,4 +95,4 @@ with open(outputFile, 'r') as csvfile:
             raise
 
 print(g.serialize(format='n3').decode("utf-8"))
-g.serialize(destination='enriched_ouput.txt', format='turtle')
\ No newline at end of file
+g.serialize(destination='enriched_output.txt', format='turtle')
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index 364afc8..eb5322a 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,2 +1,5 @@
-http://www.wikidata.org/entity/Q111904
-http://www.wikidata.org/entity/Q1070
\ No newline at end of file
+http://www.wikidata.org/entity/Q3289517
+http://www.wikidata.org/entity/Q79663
+http://www.wikidata.org/entity/Q2145339
+http://www.wikidata.org/entity/Q23197
+http://www.wikidata.org/entity/Q494755
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 55ec496..88e8be5 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -17,5 +17,7 @@ This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countrie
 >FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
 >}
 
+[Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
+
 - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder
 - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output)
diff --git a/semantic_enrichment/countries.ttl b/semantic_enrichment/countries.ttl
index 08e9c38..fe50b16 100644
--- a/semantic_enrichment/countries.ttl
+++ b/semantic_enrichment/countries.ttl
@@ -1328,7 +1328,25 @@
     ns1:P17  ;
     ns1:P625 "Point(31.239444444 30.056111111)" .
 
+ rdfs:label "Smithville" ;
+    ns1:P17  ;
+    ns1:P625 "Point(-85.820833333 35.957222222)" .
+
+ rdfs:label "Nashville" ;
+    ns1:P17  ;
+    ns1:P625 "Point(-86.783888888 36.165)" .
+
+ rdfs:label "Pegram" ;
+    ns1:P17  ;
+    ns1:P625 "Point(-87.051666666 36.101666666)" .
+
+ rdfs:label "Madison County" ;
+    ns1:P17  ;
+    ns1:P625 "Point(-88.84 35.61)" .
+
  rdfs:label "Alexander City" ;
+    ns1:P17  ;
+    ns1:P625 "Point(-85.936008 32.933157)" .
 
  rdfs:label "Oceania" .
  rdfs:label "North America" .
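Note that the body of the callSPARQL helper used by this script is elided from the hunks above. For orientation only, a minimal sketch of such a helper could look like the following. The Wikidata endpoint URL, the Accept header, and the idea of parsing the CONSTRUCT result straight into the shared rdflib graph are illustrative assumptions here, not the script's actual code:

```python
import requests
from rdflib import Graph

g = Graph()

def callSPARQL(query):
    # Hypothetical sketch: send the CONSTRUCT query to the Wikidata
    # query service and merge the Turtle response into the shared graph.
    resp = requests.post(
        "https://query.wikidata.org/sparql",
        data={"query": query},
        headers={"Accept": "text/turtle"},
    )
    resp.raise_for_status()
    g.parse(data=resp.text, format="turtle")
```

Whatever the real helper does, the loop above then serializes the accumulated graph to enriched_output.txt, which is merged manually into semantic_enrichment/countries.ttl as described in the readme.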
-- cgit v1.2.3 From 5fdfece97fb2d50a10eab5004a6467ec0097ece8 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 6 Nov 2020 11:19:28 +0000 Subject: Uploader script improvements --- bh20sequploader/main.py | 5 +++-- bh20sequploader/qc_fasta.py | 9 +++++---- doc/INSTALL.md | 8 +++++++- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'doc/INSTALL.md') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index f89b458..ea0fa70 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,7 +49,7 @@ sequence for enough overlap with the reference genome failed = True except Exception as e: log.exception("Failed metadata QC") - failed = True + failed = True # continue with the FASTA checker target = [] try: @@ -64,13 +64,14 @@ sequence for enough overlap with the reference genome target[1] = ("reads_2."+target[1][0][6:], target[1][1], target[1][2]) if do_qc and target[0][2] == 'text/fasta' and sample_id != target[0][1]: - raise ValueError("The sample_id field in the metadata must be the same as the FASTA header") + raise ValueError(f"The sample_id field in the metadata ({sample_id}) must be the same as the FASTA header ({target[0][1]})") except Exception as e: log.exception("Failed sequence QC") failed = True if failed: + log.debug("Bailing out!") exit(1) return target diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index f567f0a..814fb3e 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -66,7 +66,8 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): similarity = 0 try: - cmd = ["minimap2", "-c -x asm20", tmp1.name, tmp2.name] + log.debug("Trying to run minimap2") + cmd = ["minimap2", "-c", "-x", "asm20", tmp1.name, tmp2.name] logging.info("QC checking similarity to reference") logging.info(" ".join(cmd)) result = subprocess.run(cmd, stdout=subprocess.PIPE) @@ -83,9 +84,7 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): if similarity < 70.0: raise ValueError( - "QC fail for {}: alignment to reference was less than 70%% (was %2.2f%%)".format( - seqlabel, similarity - )) + f"QC fail for {seqlabel}: alignment to reference was less than 70% (was {similarity})") return "sequence.fasta" + gz, seqlabel, seq_type elif seq_type == "text/fastq": @@ -93,4 +92,6 @@ def qc_fasta(arg_sequence, check_with_mimimap2=True): sequence.detach() return "reads.fastq" + gz, seqlabel, seq_type else: + log.debug(seqlabel) + log.debug(seq_type) raise ValueError("Sequence file ({}) does not look like a DNA FASTA or FASTQ".format(arg_sequence)) diff --git a/doc/INSTALL.md b/doc/INSTALL.md index f54c8f2..45aca0f 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -31,7 +31,7 @@ arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-clien 3. Run the tool directly with ```sh -guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs python-pyshex -- python3 bh20sequploader/main.py example/sequence.fasta example/maximum_metadata_example.yaml +guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs python-pyshex -- python3 bh20sequploader/main.py example/maximum_metadata_example.yaml example/sequence.fasta ``` Note that python-pyshex is packaged in @@ -44,6 +44,12 @@ repository. E.g. 
 env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/opt/guix/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp python3 bh20sequploader/main.py --help
 ```
 
+Latest successful Guix run
+
+```sh
+env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/opt/guix/bin/guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs python-pyshex python-arvados-python-client python-schema-salad minimap2 -- python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/AL_UT14.yaml scripts/uthsc_samples/yaml/AL_UT14.fa
+```
+
 ### Using the Web Uploader
 
 To run the web uploader in a GNU Guix environment/container run it with something like
--
cgit v1.2.3

From 7c74a20b90ca647ca387eff2ed830c22f5ba1282 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 12:48:00 +0000
Subject: Country trouble shooting

---
 doc/INSTALL.md                              |  1 +
 scripts/db_enrichment/country_enrichment.py | 29 ++++++++++++++++++++++++++---
 scripts/db_enrichment/input_location.csv    | 10 ----------
 scripts/db_enrichment/readme.md             | 12 +++++++-----
 4 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'doc/INSTALL.md')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 45aca0f..367b452 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -77,6 +77,7 @@ Note: see above on GUIX_PACKAGE_PATH.
 ## Run country semantic enrichment script
 
     cd bh20-seq-resource/scripts/db_enrichment
+    edit input_location.csv
     guix environment guix --ad-hoc git python nss-certs python-rdflib -- python3 country_enrichment.py
 
 ## Run the tests
diff --git a/scripts/db_enrichment/country_enrichment.py b/scripts/db_enrichment/country_enrichment.py
index 1f99d42..f62a64e 100644
--- a/scripts/db_enrichment/country_enrichment.py
+++ b/scripts/db_enrichment/country_enrichment.py
@@ -39,14 +39,36 @@
 
 g = Graph()
 
+test_query="""
+# Use with https://query.wikidata.org/
+SELECT DISTINCT ?a ?label ?country ?continent ?coor WHERE {
+    BIND (XXX as ?a) .
+    OPTIONAL {
+        ?a wdt:P625 ?coor.
+    }
+    ?a rdfs:label ?label .
+    ?a wdt:P17 ?country.
+    ?country rdfs:label ?country_label .
+    OPTIONAL {
+        ?country wdt:P30 ?continent.
+        ?continent rdfs:label ?continent_label
+        FILTER (lang(?continent_label)='en')
+    }
+    FILTER (lang(?country_label)='en')
+    FILTER (lang(?label)='en')
+}
+"""
+
+# wdt:P625 are GEO coordinates
+
 query = """
 construct {
     ?a wdt:P625 ?c.
     ?a rdfs:label ?label .
     ?a wdt:P17 ?country.
     ?country rdfs:label ?country_label .
-    ?country wdt:P30 ?continent.
-    ?continent rdfs:label ?continent_label
+    ?country wdt:P30 ?continent .
+    ?continent rdfs:label ?continent_label .
 } WHERE
 {
     BIND (XXX as ?a) .
@@ -59,7 +81,6 @@ construct {
     FILTER (lang(?continent_label)='en')
     FILTER (lang(?country_label)='en')
     FILTER (lang(?label)='en')
-
 }
 """""
 
 outputFile = 'input_location.csv'
@@ -72,6 +93,8 @@ with open(outputFile, 'r') as csvfile:
         counter=counter+1
 
         try:
+            testq = test_query.replace("XXX", "<"+row[0]+">")
+            print(testq)
             tmpquery=query.replace("XXX", "<"+row[0]+">")
             print(tmpquery)
diff --git a/scripts/db_enrichment/input_location.csv b/scripts/db_enrichment/input_location.csv
index a4246cd..8c3308f 100644
--- a/scripts/db_enrichment/input_location.csv
+++ b/scripts/db_enrichment/input_location.csv
@@ -1,16 +1,6 @@
 http://www.wikidata.org/entity/Q7960498
 http://www.wikidata.org/entity/Q692895
-http://www.wikidata.org/entity/Q928
 http://www.wikidata.org/entity/Q2722074
 http://www.wikidata.org/entity/Q25622187
 http://www.wikidata.org/entity/Q27684996
 http://www.wikidata.org/entity/Q2757125
-http://www.wikidata.org/entity/Q1922283
-http://www.wikidata.org/entity/Q490
-http://www.wikidata.org/entity/Q677037
-http://www.wikidata.org/entity/Q3037
-http://www.wikidata.org/entity/Q843
-http://www.wikidata.org/entity/Q183
-http://www.wikidata.org/entity/Q29
-http://www.wikidata.org/entity/Q17
-http://www.wikidata.org/entity/Q810
diff --git a/scripts/db_enrichment/readme.md b/scripts/db_enrichment/readme.md
index 88e8be5..7539104 100644
--- a/scripts/db_enrichment/readme.md
+++ b/scripts/db_enrichment/readme.md
@@ -11,11 +11,13 @@ File containing information about the countries in our database. Additional info
 
 This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet:
 
->SELECT DISTINCT ?geoLocation  WHERE
->{
->?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
->FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
->}
+```sparql
+SELECT DISTINCT ?geoLocation  WHERE
+{
+  ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
+  FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
+}
+```
 
 [Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
--
cgit v1.2.3

From 14afd33de16c0ee0705ce2cfdb06b3aff4e1b22e Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Tue, 10 Nov 2020 11:23:40 +0000
Subject: Virtuoso: run script

---
 doc/INSTALL.md | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'doc/INSTALL.md')

diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 367b452..0367c63 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -107,3 +107,9 @@ And run from the data dir
     guix environment --ad-hoc virtuoso-ose -- virtuoso-t -f
 
 Visit http://localhost:8890/sparql
+
+To update the turtle files do
+
+    guix environment -C guix --ad-hoc python python-requests raptor2 curl --network -- python3 ./scripts/update_virtuoso/check_for_updates.py cache.txt dba dba
+
+where dba is the default password.
--
cgit v1.2.3
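A quick smoke test of the Virtuoso endpoint described above can be scripted as well. This is a sketch only: the endpoint URL comes from the text, while the triple-counting query and the format parameter (Virtuoso's switch for selecting JSON results) are illustrative assumptions:

```python
import requests

# Hypothetical check: count the triples currently loaded in the store
# via the local Virtuoso SPARQL endpoint set up above.
resp = requests.get(
    "http://localhost:8890/sparql",
    params={
        "query": "SELECT (COUNT(*) AS ?n) WHERE { ?s ?p ?o }",
        "format": "application/sparql-results+json",
    },
)
resp.raise_for_status()
print("triples loaded:", resp.json()["results"]["bindings"][0]["n"]["value"])
```

If the count grows after running check_for_updates.py, the update cycle is working end to end.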