From c69046ee9a5e24eadcd8cb885633328b0fd88011 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 17 Jul 2020 11:06:33 +0100 Subject: Update generated docs --- doc/blog/using-covid-19-pubseq-part1.html | 192 +++++++++++++++-------------- doc/blog/using-covid-19-pubseq-part4.html | 44 +++++-- doc/blog/using-covid-19-pubseq-part5.html | 194 ++++++++++++++++++++++++++---- 3 files changed, 305 insertions(+), 125 deletions(-) diff --git a/doc/blog/using-covid-19-pubseq-part1.html b/doc/blog/using-covid-19-pubseq-part1.html index 1959fac..0e6136c 100644 --- a/doc/blog/using-covid-19-pubseq-part1.html +++ b/doc/blog/using-covid-19-pubseq-part1.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
- +This means that when someone uploads a SARS-CoV-2 sequence using one @@ -274,24 +274,24 @@ expressed in a 9606 for Homo sapiens + doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens type: string jsonldPredicate: - _id: http://www.ebi.ac.uk/efo/EFO_0000532 - _type: "@id" - noLinkCheck: true + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + _type: "@id" + noLinkCheck: true host_sex: - doc: Sex of the host as defined in PATO, expect male () or female () + doc: Sex of the host as defined in PATO, expect male () or female () type: string? jsonldPredicate: - _id: http://purl.obolibrary.org/obo/PATO_0000047 - _type: "@id" - noLinkCheck: true + _id: http://purl.obolibrary.org/obo/PATO_0000047 + _type: "@id" + noLinkCheck: true host_age: - doc: Age of the host as number (e.g. 50) + doc: Age of the host as number (e.g. 50) type: int? jsonldPredicate: - _id: http://purl.obolibrary.org/obo/PATO_0000011 + _id: http://purl.obolibrary.org/obo/PATO_0000011
The latest run of the pipeline can be viewed here. Each of these @@ -339,8 +339,8 @@ these identifiers throughout.
To explore an RDF dataset, the first query we can do is open and gets @@ -350,10 +350,10 @@ the following in a SPARQL end point
select distinct ?p
-{
+select distinct ?p
+{
?o ?p ?s
-}
+}
select distinct ?g -{ - GRAPH ?g {?s ?p ?o} -} +select distinct ?g +{ + GRAPH ?g {?s ?p ?o} +}
select distinct ?s -{ - ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?s -} +select distinct ?s +{ + ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?s +}
select distinct ?s
-{
- ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?id .
+select distinct ?s
+{
+ ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?id .
?id ?p ?s
-}
+}
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?dataset ?submitter
-{
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?submitter
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter
-}
+}
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select (COUNT(distinct ?dataset) as ?num)
-{
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select (COUNT(distinct ?dataset) as ?num)
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter
-}
+}
To get datasets with submitters we can do the above
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?dataset ?p ?submitter
-{
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?p ?submitter
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter
-}
+}
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/> -select distinct ?dataset ?submitter -{ +PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/> +select distinct ?dataset ?submitter +{ ?dataset pubseq:submitter ?id . ?id ?p ?submitter . - FILTER(CONTAINS(?submitter,"Roychoudhury")) . -} + FILTER(CONTAINS(?submitter,"Roychoudhury")) . +}
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?p
-{
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?p
+{
?dataset ?p ?o .
?dataset pubseq:submitter ?id .
-}
+}
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/> -select distinct ?sid ?sample ?p1 ?dataset ?submitter -{ +PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/> +select distinct ?sid ?sample ?p1 ?dataset ?submitter +{ ?dataset pubseq:submitter ?id . ?id ?p ?submitter . - FILTER(CONTAINS(?submitter,"Roychoudhury")) . + FILTER(CONTAINS(?submitter,"Roychoudhury")) . ?dataset pubseq:sample ?sid . ?sid ?p1 ?sample -} +}
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-PREFIX sio: <http://semanticscience.org/resource/>
-select distinct ?sample ?p ?o
-{
+PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+PREFIX sio: <http://semanticscience.org/resource/>
+select distinct ?sample ?p ?o
+{
?sample sio:SIO_000115 "MT326090.1" .
?sample ?p ?o .
-}
+}
Now we know how to get at the origin we can do it the other way round @@ -570,15 +565,11 @@ and fetch all sequences referring to Washington state
-select ?seq ?sample -{ - ?seq <http://biohackathon.org/bh20-seq-schema#MainSchema/sample> ?sample . - ?sample <http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.wikidata.org/entity/Q1223> -} +select ?seq ?sample +{ + ?seq <http://biohackathon.org/bh20-seq-schema#MainSchema/sample> ?sample . + ?sample <http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.wikidata.org/entity/Q1223> +}
+Likewise to list all sequences from Turkey we can find the wikidata +entity is Q43: +
+ +select ?seq ?sample +{ + ?seq <http://biohackathon.org/bh20-seq-schema#MainSchema/sample> ?sample . + ?sample <http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.wikidata.org/entity/Q43> +} +
The public sequence uploader collects sequences, raw data and @@ -601,8 +607,8 @@ referenced in publications and origins are citeable.
The overall effort was due to magnificent freely donated input by a @@ -617,7 +623,7 @@ Garrison this initiative would not have existed!
This means that when someone uploads a SARS-CoV-2 sequence using one @@ -253,18 +267,28 @@ which triggers a rerun of our workflows.
+Workflows are written in the common workflow language (CWL) and listed +on github. PubSeq being an open project these workflows can be studied +and modified! +
+Work in progress!
The public sequence resource uses multiple data formats listed on the -DOWNLOAD page. One of the most exciting features is the full support +download page. One of the most exciting features is the full support for RDF and semantic web/linked data ontologies. This technology allows for querying data in unprescribed ways - that is, you can formulate your own queries without dealing with a preset model of that data (so typical of CSV files and SQL tables). Examples of exploring -data are listed here. +data are listed here.
In this BLOG we are going to look at the metadata entered on the -COVID-19 PubSeq website (or command line client). It is important to +COVID-19 PubSeq website (or command line client). It is important to understand that anyone, including you, can change that information!
The default metadata schema is listed here.
@@ -274,8 +289,8 @@ The default metadata schema is listed
-
Using the schema we use pyshex shex expressions and schema salad to
@@ -285,13 +300,13 @@ All from that one metadata schema.
-One of the first things we wanted to do is to add a field for the data
-license. Initially we only support CC-4.0 as a license by default, but
-now we want to give uploaders the option to make it an even more
+One of the first things we want to do is to add a field for the data
+license. Initially we only supported CC-4.0 as a license, but
+we wanted to give uploaders the option to use an even more
liberal CC0 license. The first step is to find a good ontology term
for the field. Searching for `creative commons cc0 rdf' rendered this
useful page. We also find an overview where CC0 is represented as URI
@@ -302,13 +317,148 @@ attributionName and attributionURL.
-Note: work in progress
+A minimal triple should be
+
+Other suggestions are
+
+and 'dc:source' which indicates the original source of any modified
+work, specified as a URI.
+The prefix 'cc:' is an abbreviation for http://creativecommons.org/ns#.
+
+Going back to the schema, where does it fit? Under host, sample,
+virus, technology or submitter block? It could fit under sample, but
+actually the license concerns the whole metadata block and sequence,
+so I think we can fit it under its own license tag. For example
+
+id: placeholder
+
+So, let's update the example. Notice the license info is optional - if it is missing
+we just assume the default CC-BY-4.0.
+
+One thing that is interesting is that in the name space https://creativecommons.org/ns there
+is no mention of a title. I think it is useful, however, because we have no such field.
+So, we'll add it simply as a title field. Now the draft schema is
+Now, we are no ontology experts, right? So, next we submit a patch to
+our source tree and ask for feedback before wiring it up in the data
+entry form. The pull request was submitted here and reviewed on the
+gitter channel and I merged it.
+
+To add the new fields to the form we have to modify it a little. If we
+go to the upload form we need to add the license box. The schema is
+loaded in main.py in the 'generateform' function.
+
+With this patch the website adds the license input fields on the form.
+
+Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This
+was done with this patch. In the end we decided to use the Dublin core title,
+http://purl.org/metadata/dublin_core_elements#Title:
+
+Note that cc:attributionSource is not really defined in the cc standard.
+
+When pushing the license info we discovered the workflow broke because
+the existing data had no licensing info. So we changed the license
+field to be optional - a missing license assumes it is CC-BY-4.0.
+3 How is the website generated?
+3 How is the website generated?
4 Modifying the schema
+4 Modifying the schema
+id xhtml:license <http://creativecommons.org/licenses/by/4.0/> .
+
+
+
+
+id dc:title "Description" .
+id cc:attributionName "Your Name" .
+id cc:attributionURL <http://resource.org/id>
+
+
+
+
+license:
+ license_type: http://creativecommons.org/licenses/by/4.0/
+ attribution_title: "Sample ID"
+ attribution_name: "John doe, Joe Boe, Jonny Oe"
+ attribution_url: http://covid19.genenetwork.org/id
+ attribution_source: https://www.ncbi.nlm.nih.gov/pubmed/323088888
+
+
+
+- name: licenseSchema
+ type: record
+ fields:
+ license_type:
+ doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#License
+ title:
+ doc: Attribution title related to license
+ type: string?
+ jsonldPredicate:
+ _id: http://semanticscience.org/resource/SIO_001167
+ attribution_url:
+ doc: Attribution URL related to license
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#Work
+ attribution_source:
+ doc: Attribution source URL
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#Work
+
+5 Adding fields to the form
+:licenseShape{
+ cc:License xsd:string;
+ dc:Title xsd:string ?;
+ cc:attributionName xsd:string ?;
+ cc:attributionURL xsd:string ?;
+ cc:attributionSource xsd:string ?;
+}
+
+6 TODO Testing the license fields
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-12 Sun 06:24.
+
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-16 Thu 03:27.