From 92fa550c6edfeb9acc9a5cbc31c0c272f8703898 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 29 May 2020 11:00:08 -0500 Subject: Blog stuff --- bh20simplewebuploader/main.py | 2 +- bh20simplewebuploader/static/blog.css | 44 ++ bh20simplewebuploader/templates/banner.html | 2 +- bh20simplewebuploader/templates/header.html | 5 +- doc/blog/using-covid-19-pubseq-part1.html | 637 ++++++++++++++++++++++++++++ doc/blog/using-covid-19-pubseq-part3.html | 44 +- doc/blog/using-covid-19-pubseq-part3.org | 24 +- 7 files changed, 742 insertions(+), 16 deletions(-) create mode 100644 bh20simplewebuploader/static/blog.css create mode 100644 doc/blog/using-covid-19-pubseq-part1.html diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index 847e36b..81fe571 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -451,7 +451,7 @@ def blog_page(): buf = None; if blog_content: buf = get_html_body('doc/blog/'+blog_content+'.html') - return render_template('blog.html',menu='BLOG',embed=buf) + return render_template('blog.html',menu='BLOG',embed=buf,blog=blog_content) @app.route('/about') diff --git a/bh20simplewebuploader/static/blog.css b/bh20simplewebuploader/static/blog.css new file mode 100644 index 0000000..145bada --- /dev/null +++ b/bh20simplewebuploader/static/blog.css @@ -0,0 +1,44 @@ +.title { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace } +.table-of-contents { font-family: monospace; color: red; } +/* .text-table-of-contents { font-family: monospace; color: black; font-size:80%; } */ +.timestamp { font-family: monospace; color: darkgreen; } + +h1,h2 { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace; color:black;background-color:#F0F8FF; } +h2,h3,h4 { color: darkblue; } +code { color: darkblue; } +body {font-family: Palatino, 'Palatino Linotype', serif; color:black;background-color:#e5d8f0; font-size: large } + +div.verbatim { 
margin: 30px; color: black; background-color: white; border-style:outset; font-family: palatino font, monospace; font-size:80%; font-weight:bold; } +div.quote { font-family: palatino font, monospace; font-size:80%; } +div.quotation { font-family: palatino font, monospace; font-size:80%; } +pre.example { font-family: prestige, monospace; color:black; font-size:70%; background-color: lightyellow; } +pre.src { margin: 30px; font-family: prestige, monospace; font-weight: bold; color:white; font-size:80%; background-color: black; } + +div[id="text-table-of-contents"]{ + font-family: palatino font, monospace; background-color:white; + border-style: dotted; + border-color: #98bf21; + border-width: 1px; +} +div[class^="outline-text"] { + background-color:#ebe6f0; + border-style: dotted; + border-color: #98bf21; + border-width: 1px; + font-family: Palatino, 'Palatino Linotype', serif; color:black; font-size: large +} +span[class="todo TESTING"] { + color:purple; +} +span[class="todo IN_PROGRESS"] { + color:brown; +} +span[class^="section-number"] { + color:grey; +} +span[class="journal"] { + color:darkblue; +} +span[class="year"] { + color:darkred; +} diff --git a/bh20simplewebuploader/templates/banner.html b/bh20simplewebuploader/templates/banner.html index 3e2ee9d..8f2b09a 100644 --- a/bh20simplewebuploader/templates/banner.html +++ b/bh20simplewebuploader/templates/banner.html @@ -1,6 +1,6 @@
-

Web uploader for Public SARS-CoV-2 Sequence Resource

+

COVID-19 PubSeq: Public SARS-CoV-2 Sequence Resource

Database contains public sequences!

diff --git a/bh20simplewebuploader/templates/header.html b/bh20simplewebuploader/templates/header.html index 0c06f62..6e326ee 100644 --- a/bh20simplewebuploader/templates/header.html +++ b/bh20simplewebuploader/templates/header.html @@ -3,6 +3,9 @@ - Web uploader for Public SARS-CoV-2 Sequence Resource + COVID-19 PubSeq: Public SARS-CoV-2 Sequence Resource + {% if blog %} + + {% endif %} diff --git a/doc/blog/using-covid-19-pubseq-part1.html b/doc/blog/using-covid-19-pubseq-part1.html new file mode 100644 index 0000000..5e52b82 --- /dev/null +++ b/doc/blog/using-covid-19-pubseq-part1.html @@ -0,0 +1,637 @@ + + + + + + + +COVID-19 PubSeq (part 1) + + + + + + + +
+ UP + | + HOME +
+

COVID-19 PubSeq (part 1)

+ +

+As part of the COVID-19 Biohackathon 2020 we formed a working group +to create a COVID-19 Public Sequence Resource (COVID-19 PubSeq) for +Corona virus sequences. The general idea is to create a repository +that has a low barrier to entry for uploading sequence data using best +practices. I.e., data published with a creative commons 4.0 (CC-4.0) +license with metadata using state-of-the-art standards and, perhaps +most importantly, providing standardised workflows that get triggered +on upload, so that results are immediately available in standardised +data formats. +

+ +
+

1 What does this mean?

+
+

+This means that when someone uploads a SARS-CoV-2 sequence using one +of our tools (CLI or web-based) they add some metadata which is +expressed in a schema that looks like +

+ +
+
- name: hostSchema
+  type: record
+  fields:
+    host_species:
+        doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens
+        type: string
+        jsonldPredicate:
+          _id: http://www.ebi.ac.uk/efo/EFO_0000532
+          _type: "@id"
+          noLinkCheck: true
+    host_sex:
+        doc: Sex of the host as defined in PATO, expect male () or female ()
+        type: string?
+        jsonldPredicate:
+          _id: http://purl.obolibrary.org/obo/PATO_0000047
+          _type: "@id"
+          noLinkCheck: true
+    host_age:
+        doc: Age of the host as number (e.g. 50)
+        type: int?
+        jsonldPredicate:
+          _id: http://purl.obolibrary.org/obo/PATO_0000011
+
+
+ +

+this metadata gets transformed into an RDF database which means +information can easily be fetched related to uploaded sequences. +We'll show an example below where we query a live database. +

+ +

+There is more: when a new sequence gets uploaded COVID-19 PubSeq kicks +in with a number of workflows running in the cloud. These workflows +generate a fresh variation graph (GFA) containing all sequences, an +RDF file containing metadata, and an RDF file containing the variation +graph in triples. Soon we will add multi sequence alignments (MSA) and +more. Anyone can contribute data, tools and workflows to this +initiative! +

+
+
+ + +
+

2 Fetch sequence data

+
+

+The latest run of the pipeline can be viewed here. Each of these +generated files can just be downloaded for your own use and sharing! +Data is published under a Creative Commons 4.0 attribution license +(CC-BY-4.0). This means that, unlike some other 'public' resources, +you can use this data in any way you want, provided the submitter gets +attributed. +

+ +

+If you download the GFA or FASTA sequences you'll find sequences are +named something like +keep:e17abc8a0269875ed4cfbff5d9897c6c+123/sequence.fasta which +refers to an internal Arvados Keep representation of the FASTA +sequence. Keep is content-addressable which means that the value +e17abc8a0269875ed4cfbff5d9897c6c uniquely identifies the file by its +contents. If the contents change, the identifier changes! We use +these identifiers throughout. +

+
+
+ +
+

3 Predicates

+
+

+To explore an RDF dataset, the first query we can do is open and gets +us a list. Let's look at all the predicates in the dataset by pasting +the following in a SPARQL endpoint +http://sparql.genenetwork.org/sparql/ +

+ +
+
select distinct ?p
+{
+   ?o ?p ?s
+}
+
+
+ +

+you can ignore the openlink and w3 ones. To reduce results to a named +graph set the default graph. +To get a list of graphs in the dataset, first do +

+ +
+
select distinct ?g
+{
+    GRAPH ?g {?s ?p ?o}
+}
+
+
+ +

+To limit the search to metadata, add +http://covid-19.genenetwork.org/graph/metadata.ttl in the top input +box. Now you can find a predicate for submitter that looks like +http://biohackathon.org/bh20-seq-schema#MainSchema/submitter. +

+ +

+To list all submitters, try +

+ + + +

+Oh wait, it returns things like nodeID://b76150! That is not helpful, +these are anonymous nodes in the graph. These point to another triple +and by +

+ +
+
select distinct ?s
+{
+   ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?id .
+   ?id ?p ?s
+}
+
+
+ +

+you get a list of all submitters including "University of Washington, +Seattle, WA 98109, USA". +

+ +

+To lift the full URL out of the query you can use a header like +

+ +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?submitter
+{
+   ?dataset pubseq:submitter ?id .
+   ?id ?p ?submitter
+}
+
+
+ +

+which reads a bit better. We can also see the submitted sequences. One +of them submitted by University of Washington is +http://collections.lugli.arvadosapi.com/c=030bcb8fda7f19743157359f5855f7a6+126/sequence.fasta +(note the ID may have changed so pick one with above query). +To see the submitted metadata replace sequence.fasta with metadata.yaml +http://collections.lugli.arvadosapi.com/c=030bcb8fda7f19743157359f5855f7a6+126/metadata.yaml +

+ +

+Now that we got this far, let's count the datasets submitted with +

+ +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select (COUNT(distinct ?dataset) as ?num)
+{
+   ?dataset pubseq:submitter ?id .
+   ?id ?p ?submitter
+}
+
+
+
+
+ + +
+

4 Fetch submitter info and other metadata

+
+

+To get datasets with submitters we can do the above +

+ +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?p ?submitter
+{
+   ?dataset pubseq:submitter ?id .
+   ?id ?p ?submitter
+}
+
+
+ +

+Tells you one submitter is "Roychoudhury,P.;Greninger,A.;Jerome,K." +with a URL predicate (http://purl.obolibrary.org/obo/NCIT_C42781) +explaining "The individual who is responsible for the content of a +document." Well formed URIs point to real information about the URI +itself. Welcome to the power of the semantic web. +

+ +

+Let's focus on one sample with +

+ +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?submitter
+{
+   ?dataset pubseq:submitter ?id .
+   ?id ?p ?submitter .
+   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
+}
+
+
+ +

+That is a lot of samples! We just want to pick one, so let's +see if we can get a sample ID by listing sample predicates +

+ +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?p
+{
+   ?dataset ?p ?o .
+   ?dataset pubseq:submitter ?id .
+}
+
+
+ +

+which lists a predicate named +http://biohackathon.org/bh20-seq-schema#MainSchema/sample. +Let's zoom in on those of Roychoudhury with +

+ + +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?sid ?sample ?p1 ?dataset ?submitter
+{
+   ?dataset pubseq:submitter ?id .
+   ?id ?p ?submitter .
+   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
+   ?dataset pubseq:sample ?sid .
+   ?sid ?p1 ?sample
+}
+
+
+ +

+which shows pretty much everything known about their submissions in +this database. Let's focus on one sample "MT326090.1" with predicate +http://semanticscience.org/resource/SIO_000115. +

+ +
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+PREFIX sio: <http://semanticscience.org/resource/>
+select distinct ?sample ?p ?o
+{
+   ?sample sio:SIO_000115 "MT326090.1" .
+   ?sample ?p ?o .
+}
+
+
+ +

+This query tells us the sample was submitted "2020-03-21" and +originates from http://www.wikidata.org/entity/Q30, i.e., the USA and +is a biospecimen collected from the back of the throat by swabbing. +We can track it back to the original GenBank submission. +

+ +

+We have also added country and label data to make it a bit easier +to view/query the database. +

+
+
+ +
+

5 Fetch all sequences from Washington state

+
+

+Now we know how to get at the origin we can do it the other way round +and fetch all sequences referring to Washington state +

+ + + +

+which lists 300 sequences originating from Washington state! Which is almost +half of the set coming out of GenBank. +

+
+
+ +
+

6 Discussion

+
+

+The public sequence uploader collects sequences, raw data and +(machine) queryable metadata. Not only that: data gets analyzed in the +pangenome and results are presented immediately. The data can be +referenced in publications and origins are citable. +

+
+
+ +
+

7 Acknowledgements

+
+

+The overall effort was due to magnificent freely donated input by a +great number of people. I particularly want to thank Thomas Liener for +the great effort he made with the ontology group in getting ontologies +and schema sorted! Peter Amstutz and Arvados/Curii helped build the +on-demand compute and back-ends. Thanks also to Michael Crusoe for +supporting the Common Workflow Language initiative. And without Erik +Garrison this initiative would not have existed! +

+
+
+
+
+
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-05-29 Fri 10:12
. +
+ + diff --git a/doc/blog/using-covid-19-pubseq-part3.html b/doc/blog/using-covid-19-pubseq-part3.html index ac32717..7903791 100644 --- a/doc/blog/using-covid-19-pubseq-part3.html +++ b/doc/blog/using-covid-19-pubseq-part3.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - + COVID-19 PubSeq Uploading Data (part 3) @@ -248,14 +248,16 @@ for the JavaScript code in this tag.

Table of Contents

-
-

1 Uploading Data

+
+

1 Uploading Data

Work in progress! @@ -263,12 +265,38 @@ for the JavaScript code in this tag.

-
-

2 What does this mean?

+
+

2 Introduction

+
+

+The COVID-19 PubSeq allows you to upload your SARS-CoV-2 strains to a +public resource for global comparisons. Compute is triggered on +upload. Read the ABOUT page for more information. +

+
+
+ +
+

3 Step 1: Sequence

+
+

+We start with an assembled or mapped sequence in FASTA format. The +PubSeq uploader contains a QC step which checks whether it is a likely +SARS-CoV-2 sequence. While PubSeq deduplicates sequences and never +overwrites metadata it probably pays to check whether your data +already is in the system by querying some metadata as described in +Query metadata with SPARQL. +

+
+
+ + +
+

4 Step 2: Metadata

-
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-05-27 Wed 07:41
. +
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-05-29 Fri 10:00
.
diff --git a/doc/blog/using-covid-19-pubseq-part3.org b/doc/blog/using-covid-19-pubseq-part3.org index 1cd2db1..296bef6 100644 --- a/doc/blog/using-covid-19-pubseq-part3.org +++ b/doc/blog/using-covid-19-pubseq-part3.org @@ -13,10 +13,24 @@ * Table of Contents :TOC:noexport: - [[#uploading-data][Uploading Data]] - - [[#table-of-contents][Table of Contents]] - - [[#what-does-this-mean][What does this mean?]] + - [[#introduction][Introduction]] + - [[#step-1-sequence][Step 1: Sequence]] + - [[#step-2-metadata][Step 2: Metadata]] + +* Introduction + +The COVID-19 PubSeq allows you to upload your SARS-CoV-2 strains to a +public resource for global comparisons. Compute is triggered on +upload. Read the [[./about][ABOUT]] page for more information. + +* Step 1: Sequence + +We start with an assembled or mapped sequence in FASTA format. The +PubSeq uploader contains a [[https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/qc_fasta.py][QC step]] which checks whether it is a likely +SARS-CoV-2 sequence. While PubSeq deduplicates sequences and never +overwrites metadata it probably pays to check whether your data +already is in the system by querying some metadata as described in +[[./blog?id=using-covid-19-pubseq-part1][Query metadata with SPARQL]]. -* Table of Contents :TOC:noexport: - - [[#what-does-this-mean][What does this mean?]] -* What does this mean? +* Step 2: Metadata -- cgit v1.2.3