From fbbec51e604964d18ab72cbf0ac24b102ecc0376 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 07:45:10 +0000
Subject: Working on upload

---
 doc/blog/using-covid-19-pubseq-part3.org | 161 ++++++++++++++++++++-----------
 1 file changed, 107 insertions(+), 54 deletions(-)

(limited to 'doc/blog/using-covid-19-pubseq-part3.org')
diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org
index fb68251..f3ba073 100644
--- a/doc/blog/using-covid-19-pubseq-part3.org
+++ b/doc/blog/using-covid-19-pubseq-part3.org
@@ -7,10 +7,19 @@
 #+HTML_HEAD: <link rel="Blog stylesheet" type="text/css" href="blog.css" />
 #+OPTIONS: ^:nil
 
+* Introduction
+
+In this document we explain how to upload data into COVID-19 PubSeq.
+This can happen through a web page, or through a command line
+script. We'll also show how to parametrize uploads by using templates.
+The procedure is much easier than with other repositories and can be
+fully automated. Once your data is uploaded, you can use our export API
+to prepare it for other repositories.
 
 
 * Table of Contents                                                     :TOC:noexport:
- - [[#uploading-data][Uploading Data]]
+ - [[#introduction][Introduction]]
+ - [[#uploading-data][Uploading data]]
  - [[#step-1-upload-sequence][Step 1: Upload sequence]]
  - [[#step-2-add-metadata][Step 2: Add metadata]]
    - [[#obligatory-fields][Obligatory fields]]
@@ -23,7 +32,7 @@
    - [[#example-uploading-bulk-genbank-sequences][Example: uploading bulk GenBank sequences]]
    - [[#example-preparing-metadata][Example: preparing metadata]]
 
-* Uploading Data
+* Uploading data
 
 The COVID-19 PubSeq allows you to upload your SARS-Cov-2 strains to a
 public resource for global comparisons. A recompute of the pangenome
@@ -165,55 +174,90 @@ file an associated metadata in [[https://github.com/arvados/bh20-seq-resource/bl
 the web form and gets validated from the same [[https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml][schema]] looks. The YAML
 that you need to create/generate for your samples looks like
 
+A minimal example of metadata looks like
+
+#+begin_src json
+  id: placeholder
+
+  license:
+      license_type: http://creativecommons.org/licenses/by/4.0/
+
+  host:
+      host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+
+  sample:
+      sample_id: XX
+      collection_date: "2020-01-01"
+      collection_location: http://www.wikidata.org/entity/Q148
+
+  virus:
+      virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+
+  technology:
+      sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
+
+  submitter:
+      authors: [John Doe]
+#+end_src
+
+A more elaborate example (note that most fields are optional) may look like
+
 #+begin_src json
-id: placeholder
-
-host:
-    host_id: XX1
-    host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
-    host_sex: http://purl.obolibrary.org/obo/PATO_0000384
-    host_age: 20
-    host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
-    host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
-    host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
-    host_vaccination: [vaccines1,vaccine2]
-    ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010
-    additional_host_information: Optional free text field for additional information
-
-sample:
-    sample_id: Id of the sample as defined by the submitter
-    collector_name: Name of the person that took the sample
-    collecting_institution: Institute that was responsible of sampling
-    specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
-    collection_date: "2020-01-01"
-    collection_location: http://www.wikidata.org/entity/Q148
-    sample_storage_conditions: frozen specimen
-    source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
-    additional_collection_information: Optional free text field for additional information
-
-virus:
-    virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
-    virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
-
-technology:
-    sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
-    sequence_assembly_method: Protocol used for assembly
-    sequencing_coverage: [70.0, 100.0]
-    additional_technology_information: Optional free text field for additional information
-
-submitter:
-    authors: [John Doe, Joe Boe, Jonny Oe]
-    submitter_name: [John Doe]
-    submitter_address: John Doe's address
-    originating_lab: John Doe kitchen
-    lab_address: John Doe's address
-    provider_sample_id: XXX1
-    submitter_sample_id: XXX2
-    publication: PMID00001113
-    submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
-    additional_submitter_information: Optional free text field for additional information
+  id: placeholder
+
+  host:
+      host_id: XX1
+      host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+      host_sex: http://purl.obolibrary.org/obo/PATO_0000384
+      host_age: 20
+      host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
+      host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
+      host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
+      host_vaccination: [vaccines1,vaccine2]
+      ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010
+      additional_host_information: Optional free text field for additional information
+
+  sample:
+      sample_id: Id of the sample as defined by the submitter
+      collector_name: Name of the person that took the sample
+      collecting_institution: Institute that was responsible of sampling
+      specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
+      collection_date: "2020-01-01"
+      collection_location: http://www.wikidata.org/entity/Q148
+      sample_storage_conditions: frozen specimen
+      source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
+      additional_collection_information: Optional free text field for additional information
+
+  virus:
+      virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+      virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
+
+  technology:
+      sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
+      sequence_assembly_method: Protocol used for assembly
+      sequencing_coverage: [70.0, 100.0]
+      additional_technology_information: Optional free text field for additional information
+
+  submitter:
+      authors: [John Doe, Joe Boe, Jonny Oe]
+      submitter_name: [John Doe]
+      submitter_address: John Doe's address
+      originating_lab: John Doe kitchen
+      lab_address: John Doe's address
+      provider_sample_id: XXX1
+      submitter_sample_id: XXX2
+      publication: PMID00001113
+      submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
+      additional_submitter_information: Optional free text field for additional information
 #+end_src
 
+More metadata is yummy when stored in RDF. [[https://yummydata.org/][Yummydata]] is useful to a wider community. Note
+that many of the terms in above example are URIs, such as
+host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606.  We use
+web ontologies for these to make the data less ambiguous and more
+FAIR. Check out the option fields as defined in the schema. If it is not listed
+a little bit of web searching may be required or [[./contact][contact]] us.
+
 ** Run the uploader (CLI)
 
 Installing with pip you should be
@@ -221,7 +265,6 @@ able to run
 
 : bh20sequploader sequence.fasta metadata.yaml
 
-
 Alternatively the script can be installed from [[https://github.com/arvados/bh20-seq-resource#installation][github]]. Run on the
 command line
 
@@ -274,13 +317,23 @@ done
 
 ** Example: preparing metadata
 
-Usually, metadata are available in tabular format, like spreadsheets. As an example, we provide a script
-[[https://github.com/arvados/bh20-seq-resource/tree/master/scripts/esr_samples][esr_samples.py]] to show you how to parse
-your metadata in YAML files ready for the upload. To execute the script, go in the ~bh20-seq-resource/scripts/esr_samples
-and execute
+Usually, metadata are available in a tabular format, such as
+spreadsheets. As an example, we provide a script [[https://github.com/arvados/bh20-seq-resource/tree/master/scripts/esr_samples][esr_samples.py]] to
+show you how to convert your metadata into YAML files ready for
+upload. To execute the script, go to the
+~bh20-seq-resource/scripts/esr_samples~ directory and execute
 
 #+BEGIN_SRC sh
 python3 esr_samples.py
 #+END_SRC
 
-You will find the YAML files in the `yaml` folder which will be created in the same directory.
+You will find the YAML files in the ~yaml~ folder, which will be
+created in the same directory.
+
+In the example we use Python pandas to read the spreadsheet into a
+tabular structure. Next we use a [[https://github.com/arvados/bh20-seq-resource/blob/master/scripts/esr_samples/template.yaml][template.yaml]] file that gets filled
+in by ~esr_samples.py~ so we get a metadata YAML file for each sample.
+
+Next, run the CLI uploader described earlier for each YAML and FASTA
+combination. It can't be much easier than this. For ESR we uploaded a batch of 600
+sequences this way by writing a few lines of Python [[https://github.com/arvados/bh20-seq-resource/blob/master/scripts/esr_samples/esr_samples.py][code]]. See this [[http://covid19.genenetwork.org/resource/20VR0995][example]].
-- 
cgit 1.4.1


From 43d7264dda8061a024befbc9ca0a89d7159b1e40 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 6 Nov 2020 09:52:32 +0000
Subject: UTHSC upload info

---
 doc/blog/using-covid-19-pubseq-part3.org |  3 +-
 scripts/uthsc_samples/.gitignore         |  1 +
 scripts/uthsc_samples/template.yaml      | 35 ++++++++++++++++++++
 scripts/uthsc_samples/uthsc_samples.py   | 57 ++++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 scripts/uthsc_samples/.gitignore
 create mode 100644 scripts/uthsc_samples/template.yaml
 create mode 100644 scripts/uthsc_samples/uthsc_samples.py

(limited to 'doc/blog/using-covid-19-pubseq-part3.org')

diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org
index f3ba073..d0d6c7f 100644
--- a/doc/blog/using-covid-19-pubseq-part3.org
+++ b/doc/blog/using-covid-19-pubseq-part3.org
@@ -255,7 +255,8 @@ more metadata is yummy when stored in RDF. [[https://yummydata.org/][Yummydata]]
 that many of the terms in above example are URIs, such as
 host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606.  We use
 web ontologies for these to make the data less ambiguous and more
-FAIR. Check out the option fields as defined in the schema. If it is not listed
+FAIR. Check out the option fields as defined in the schema. If a term is not listed there,
+check the [[https://github.com/arvados/bh20-seq-resource/blob/master/semantic_enrichment/labels.ttl][labels.ttl]] file. Also,
 a little bit of web searching may be required or [[./contact][contact]] us.
 
 ** Run the uploader (CLI)
diff --git a/scripts/uthsc_samples/.gitignore b/scripts/uthsc_samples/.gitignore
new file mode 100644
index 0000000..8786e3f
--- /dev/null
+++ b/scripts/uthsc_samples/.gitignore
@@ -0,0 +1 @@
+yaml
diff --git a/scripts/uthsc_samples/template.yaml b/scripts/uthsc_samples/template.yaml
new file mode 100644
index 0000000..1175ac8
--- /dev/null
+++ b/scripts/uthsc_samples/template.yaml
@@ -0,0 +1,35 @@
+id: placeholder
+
+license:
+    license_type: http://creativecommons.org/licenses/by/4.0/
+    title: "$sample_name - $locationx"
+    attribution_name: "Mariah Taylor, Colleen Jonsson"
+    attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
+
+host:
+    host_id: "$sample_id"
+    host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
+
+sample:
+    sample_id: "$sample_id"
+    specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
+    collection_date: "$collection_date"
+    collection_location: $location
+
+virus:
+    virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
+    virus_strain: "$strain"
+
+technology:
+    sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
+    sequence_assembly_method: https://bio.tools/BWA#!
+    additional_technology_information: Oxford Nanopore MiniIon RNA long reads
+
+submitter:
+    authors: [Mariah Taylor, Colleen Jonsson]
+    submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
+    submitter_address: UTHSC, Memphis, Tennessee 38163, USA
+    originating_lab: Regional Biocontainment Laboratory, Memphis, TN
+    provider_sample_id: $sample_id
+    submitter_sample_id: $sample_id
+    submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162]
diff --git a/scripts/uthsc_samples/uthsc_samples.py b/scripts/uthsc_samples/uthsc_samples.py
new file mode 100644
index 0000000..5c39398
--- /dev/null
+++ b/scripts/uthsc_samples/uthsc_samples.py
@@ -0,0 +1,57 @@
+import os
+import pandas as pd
+from string import Template
+from dateutil.parser import parse
+import re
+
+import sys
+
+# Spreadsheet holding the sample metadata, one row per sample; the relative path is resolved against the current working directory
+xlsx = '../../test/data/10_samples.xlsx'
+
+# Text file read into a string.Template; its $placeholders are filled in per sample
+template_yaml = 'template.yaml'
+# Directory receiving one <Sample ID>.yaml per spreadsheet row (created below if missing)
+dir_output = 'yaml'
+
+if not os.path.exists(dir_output):
+    os.makedirs(dir_output)
+
+table = pd.read_excel(xlsx)
+
+print(table)
+
+for index, row in table.iterrows():
+    sample = row['Sample ID']
+    print(f"Processing sample {sample}...")
+    # NOTE(review): the template file is re-read for every row before the output file is opened
+    with open(template_yaml) as f:
+      text = Template(f.read())
+      with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw:
+          sample_id = sample
+          sample_name = sample
+          collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')  # normalised to YYYY-MM-DD
+          locationx = row['City']+", "+row['State']+", USA"  # free-text location passed as $locationx
+          location = "https://www.wikidata.org/wiki/Q16563" # Memphis by default; replaced below on a city match
+          map = {  # city name -> Wikidata page URL; NOTE(review): this name shadows the builtin map()
+              "Pegram": "https://www.wikidata.org/wiki/Q3289517",
+              "Alexander": "https://www.wikidata.org/wiki/Q79663",
+              "Smithville": "https://www.wikidata.org/wiki/Q2145339",
+              "Nashville": "https://www.wikidata.org/wiki/Q23197",
+              "Madison": "https://www.wikidata.org/wiki/Q494755"
+              }
+          # re.match anchors at the start, so a key matches when locationx begins with that city name
+          for name in map:
+              p = re.compile(name)
+              if p.match(locationx):
+                  location = map[name]
+                  break
+
+          strain = f"SARS-CoV-2/human/USA/{sample}/2020"
+          fw.write(text.substitute(sample_id=sample_id,  # substitute() raises KeyError for any $name not supplied here
+                                   sample_name=sample_name,
+                                   collection_date=collection_date,
+                                   location=location,
+                                   locationx=locationx,
+                                   strain=strain
+                                   ))
-- 
cgit 1.4.1