From 9c9512a7e040f8247d259bdc6f9cf55d5d276baf Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 15 Jul 2020 12:48:12 +0100 Subject: Load metadata locally without pkg_resources --- doc/blog/using-covid-19-pubseq-part5.org | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'doc/blog') diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org index 4b0ea64..aa06d5e 100644 --- a/doc/blog/using-covid-19-pubseq-part5.org +++ b/doc/blog/using-covid-19-pubseq-part5.org @@ -13,6 +13,7 @@ - [[#what-is-the-schema][What is the schema?]] - [[#how-is-the-website-generated][How is the website generated?]] - [[#modifying-the-schema][Modifying the schema]] + - [[#adding-fields-to-the-form][Adding fields to the form]] * Modify Metadata @@ -113,8 +114,15 @@ So, we'll add it simply as a title field. Now the draft schema is _id: https://creativecommons.org/ns#Work #+END_SRC -Now, we are no ontology experts, right? So, next we submit a patch to our source tree and -ask for feedback before wiring it up in the data entry form. The pull request was -submitted here FIXME. +Now, we are no ontology experts, right? So, next we submit a patch to +our source tree and ask for feedback before wiring it up in the data +entry form. The pull request was submitted [[https://github.com/arvados/bh20-seq-resource/pull/97][here]] and reviewed on the +gitter channel and I merged it. + +* Adding fields to the form + +To add the new fields to the form we have to modify it a little. If we +go to the upload form we need to add the license box. The schema is +loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229][main.py]] in the 'generate_form' function. 
/Note: work in progress/ -- cgit v1.2.3 From f4ed46dae20abe5147871495ede2d6ac2b0854bc Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 15 Jul 2020 14:30:56 +0100 Subject: Add RDF output --- bh20sequploader/bh20seq-schema.yml | 9 +++++++-- bh20sequploader/bh20seq-shex.rdf | 24 +++++++++++++++++------- doc/blog/using-covid-19-pubseq-part5.org | 2 ++ 3 files changed, 26 insertions(+), 9 deletions(-) (limited to 'doc/blog') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 29ac22c..c690e8a 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -23,16 +23,21 @@ $graph: type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 + attribution_name: + doc: Attribution NAME related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionName attribution_url: doc: Attribution URL related to data license type: string? jsonldPredicate: - _id: https://creativecommons.org/ns#Work + _id: https://creativecommons.org/ns#attributionURL attribution_source: doc: Attribution source URL related to data license type: string? 
jsonldPredicate: - _id: https://creativecommons.org/ns#Work + _id: https://creativecommons.org/ns#attributionSource - name: hostSchema type: record diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 965229c..c48267d 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -1,6 +1,7 @@ PREFIX : PREFIX MainSchema: PREFIX hostSchema: +PREFIX cc: PREFIX xsd: PREFIX obo: PREFIX sio: @@ -15,10 +16,11 @@ PREFIX wikidata: MainSchema:submitter @:submitterShape ; MainSchema:technology @:technologyShape ; MainSchema:virus @:virusShape; + MainSchema:license @:licenseShape; } :hostShape { - efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; + efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; sio:SIO_000115 xsd:string ?; obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?; obo:PATO_0000011 xsd:integer ?; @@ -32,14 +34,14 @@ PREFIX wikidata: :sampleShape { sio:SIO_000115 xsd:string; - evs:C25164 xsd:string; - obo:GAZ_00000448 [wikidata:~] ; + evs:C25164 xsd:string; + obo:GAZ_00000448 [wikidata:~] ; obo:OBI_0001895 xsd:string ?; obo:NCIT_C41206 xsd:string ?; obo:OBI_0001479 IRI {0,2}; obo:OBI_0001472 xsd:string ?; sio:SIO_001167 xsd:string ?; - edam:data_2091 IRI {0,3}; + edam:data_2091 IRI {0,3}; } :submitterShape { @@ -47,7 +49,7 @@ PREFIX wikidata: sio:SIO_000116 xsd:string *; sio:SIO_000172 xsd:string ?; obo:NCIT_C37984 xsd:string ?; - obo:NCIT_C37900 xsd:string ?; + obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; obo:NCIT_C42781 xsd:string ?; obo:NCIT_C19026 xsd:string ?; @@ -63,6 +65,14 @@ PREFIX wikidata: } :virusShape{ - edam:data_1875 [ obo:NCBITaxon_~ ] ; - sio:SIO_010055 xsd:string ?; + edam:data_1875 [ obo:NCBITaxon_~ ] ; + sio:SIO_010055 xsd:string ?; } + +:licenseShape{ + cc:License xsd:string; + sio:SIO_001167 xsd:string ?; + cc:attributionName xsd:string ?; + cc:attributionURL xsd:string ?; + cc:attributionSource xsd:string ?; +} \ No newline at end of file diff --git 
a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org index aa06d5e..cb11f43 100644 --- a/doc/blog/using-covid-19-pubseq-part5.org +++ b/doc/blog/using-covid-19-pubseq-part5.org @@ -125,4 +125,6 @@ To add the new fields to the form we have to modify it a little. If we go to the upload form we need to add the license box. The schema is loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229][main.py]] in the 'generate_form' function. +With this [[https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3][patch]] the website adds the license input fields on the form. + /Note: work in progress/ -- cgit v1.2.3 From 712614e5627e54df7ec6ab975dc86a1055051455 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 15 Jul 2020 14:54:59 +0100 Subject: License RDF --- bh20sequploader/bh20seq-schema.yml | 3 ++- bh20sequploader/bh20seq-shex.rdf | 3 ++- doc/blog/using-covid-19-pubseq-part5.org | 29 +++++++++++++++++++++++------ 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'doc/blog') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index c690e8a..ef55c55 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,6 +1,7 @@ $base: http://biohackathon.org/bh20-seq-schema $namespaces: cc: http://creativecommons.org/ns# + dc: http://purl.org/metadata/dublin_core_elements# sch: https://schema.org/ efo: http://www.ebi.ac.uk/efo/ obo: http://purl.obolibrary.org/obo/ @@ -22,7 +23,7 @@ $graph: doc: Attribution title related to data license type: string? jsonldPredicate: - _id: http://semanticscience.org/resource/SIO_001167 + _id: http://purl.org/metadata/dublin_core_elements#Title attribution_name: doc: Attribution NAME related to data license type: string? 
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index c48267d..9fab334 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -2,6 +2,7 @@ PREFIX : PREFIX hostSchema: PREFIX cc: +PREFIX dc: PREFIX xsd: PREFIX obo: PREFIX sio: @@ -71,7 +72,7 @@ PREFIX wikidata: :licenseShape{ cc:License xsd:string; - sio:SIO_001167 xsd:string ?; + dc:Title xsd:string ?; cc:attributionName xsd:string ?; cc:attributionURL xsd:string ?; cc:attributionSource xsd:string ?; diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org index cb11f43..98c2c31 100644 --- a/doc/blog/using-covid-19-pubseq-part5.org +++ b/doc/blog/using-covid-19-pubseq-part5.org @@ -14,19 +14,20 @@ - [[#how-is-the-website-generated][How is the website generated?]] - [[#modifying-the-schema][Modifying the schema]] - [[#adding-fields-to-the-form][Adding fields to the form]] + - [[#testing-the-license-fields][Testing the license fields]] * Modify Metadata The public sequence resource uses multiple data formats listed on the -[[./download][DOWNLOAD]] page. One of the most exciting features is the full support +[[http://covid19.genenetwork.org/download][download]] page. One of the most exciting features is the full support for RDF and semantic web/linked data ontologies. This technology allows for querying data in unprescribed ways - that is, you can formulate your own queries without dealing with a preset model of that data (so typical of CSV files and SQL tables). Examples of exploring -data are listed [[./blog?id=using-covid-19-pubseq-part1][here]]. +data are listed [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1][here]]. In this BLOG we are going to look at the metadata entered on the -[[./][COVID-19 PubSeq]] website (or command line client). It is important to +COVID-19 PubSeq website (or command line client). 
It is important to understand that anyone, including you, can change that information! * What is the schema? @@ -42,8 +43,8 @@ All from that one metadata schema. * Modifying the schema One of the first things we want to do is to add a field for the data -license. Initially we only support CC-4.0 as a license by default, but -now we want to give uploaders the option to make it an even more +license. Initially we only supported CC-4.0 as a license, but +we wanted to give uploaders the option to use an even more liberal CC0 license. The first step is to find a good ontology term for the field. Searching for `creative commons cc0 rdf' rendered this useful [[https://creativecommons.org/ns][page]]. We also find an [[https://wiki.creativecommons.org/wiki/CC_License_Rdf_Overview][overview]] where CC0 is represented as URI @@ -127,4 +128,20 @@ loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e With this [[https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3][patch]] the website adds the license input fields on the form. -/Note: work in progress/ +Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This +was done with this [[https://github.com/arvados/bh20-seq-resource/commit/f4ed46dae20abe5147871495ede2d6ac2b0854bc][patch]]. In the end we decided to use the Dublin core title, +http://purl.org/metadata/dublin_core_elements#Title: + +#+BEGIN_SRC js +:licenseShape{ + cc:License xsd:string; + dc:Title xsd:string ?; + cc:attributionName xsd:string ?; + cc:attributionURL xsd:string ?; + cc:attributionSource xsd:string ?; +} +#+END_SRC + +Note that cc:AttributionSource is not really defined in the cc standard. 
+ +* TODO Testing the license fields -- cgit v1.2.3 From 73be46fd1db58f132fa60ff30d33d67927a341a7 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 16 Jul 2020 09:48:31 +0100 Subject: Addes ESR logo and cropped CWL logo --- bh20simplewebuploader/static/image/CWL.png | Bin 0 -> 11066 bytes bh20simplewebuploader/static/image/ESR.png | Bin 0 -> 67869 bytes bh20simplewebuploader/templates/footer.html | 6 +++++- doc/blog/using-covid-19-pubseq-part5.org | 4 ++++ 4 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 bh20simplewebuploader/static/image/CWL.png create mode 100644 bh20simplewebuploader/static/image/ESR.png (limited to 'doc/blog') diff --git a/bh20simplewebuploader/static/image/CWL.png b/bh20simplewebuploader/static/image/CWL.png new file mode 100644 index 0000000..81d1807 Binary files /dev/null and b/bh20simplewebuploader/static/image/CWL.png differ diff --git a/bh20simplewebuploader/static/image/ESR.png b/bh20simplewebuploader/static/image/ESR.png new file mode 100644 index 0000000..557c798 Binary files /dev/null and b/bh20simplewebuploader/static/image/ESR.png differ diff --git a/bh20simplewebuploader/templates/footer.html b/bh20simplewebuploader/templates/footer.html index a1dd4fd..37a6b64 100644 --- a/bh20simplewebuploader/templates/footer.html +++ b/bh20simplewebuploader/templates/footer.html @@ -21,7 +21,7 @@
- +
@@ -29,6 +29,10 @@
+
+ +
+
-
-

1 What does this mean?

+ -
-

2 Fetch sequence data

+
+

2 Fetch sequence data

The latest run of the pipeline can be viewed here. Each of these @@ -339,8 +339,8 @@ these identifiers throughout.

-
-

3 Predicates

+
+

3 Predicates

To explore an RDF dataset, the first query we can do is open and gets @@ -350,10 +350,10 @@ the following in a SPARQL end point

-
select distinct ?p
-{
+
select distinct ?p
+{
    ?o ?p ?s
-}
+}
 
@@ -364,10 +364,10 @@ To get a -
select distinct ?g
-{
-    GRAPH ?g {?s ?p ?o}
-}
+
select distinct ?g
+{
+    GRAPH ?g {?s ?p ?o}
+}
 
@@ -383,10 +383,10 @@ To list all submitters, try

-
select distinct ?s
-{
-   ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?s
-}
+
select distinct ?s
+{
+   ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?s
+}
 
@@ -397,11 +397,11 @@ and by

-
select distinct ?s
-{
-   ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?id .
+
select distinct ?s
+{
+   ?o <http://biohackathon.org/bh20-seq-schema#MainSchema/submitter> ?id .
    ?id ?p ?s
-}
+}
 
@@ -415,12 +415,12 @@ To lift the full URL out of the query you can use a header like

-
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?dataset ?submitter
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter
-}
+}
 
@@ -438,32 +438,32 @@ Now we got this far, lets -
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select (COUNT(distinct ?dataset) as ?num)
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select (COUNT(distinct ?dataset) as ?num)
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter
-}
+}
 
-
-

4 Fetch submitter info and other metadata

+
+

4 Fetch submitter info and other metadata

To get datasets with submitters we can do the above

-
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?dataset ?p ?submitter
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?p ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter
-}
+}
 
@@ -480,13 +480,13 @@ Let's focus on one sample with

-
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?dataset ?submitter
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?dataset ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter .
-   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
-}
+   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
+}
 
@@ -496,12 +496,12 @@ see if we can get a sample ID by listing sample predicates

-
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?p
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?p
+{
    ?dataset ?p ?o .
    ?dataset pubseq:submitter ?id .
-}
+}
 
@@ -513,15 +513,15 @@ Let's zoom in on those of Roychoudhury with
-
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-select distinct ?sid ?sample ?p1 ?dataset ?submitter
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+select distinct ?sid ?sample ?p1 ?dataset ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter .
-   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
+   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
    ?dataset pubseq:sample ?sid .
    ?sid ?p1 ?sample
-}
+}
 
@@ -532,18 +532,13 @@ this database. Let's focus on one sample "MT326090.1" with predicate

-
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
-PREFIX sio: <http://semanticscience.org/resource/>
-select distinct ?sample ?p ?o
-{
+
PREFIX pubseq: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
+PREFIX sio: <http://semanticscience.org/resource/>
+select distinct ?sample ?p ?o
+{
    ?sample sio:SIO_000115 "MT326090.1" .
    ?sample ?p ?o .
-}
+}
 
@@ -561,8 +556,8 @@ to view/query the database.
-
-

5 Fetch all sequences from Washington state

+
+

5 Fetch all sequences from Washington state

Now we know how to get at the origin we can do it the other way round @@ -570,15 +565,11 @@ and fetch all sequences referring to Washington state

-
-select ?seq ?sample
-{
-    ?seq <http://biohackathon.org/bh20-seq-schema#MainSchema/sample> ?sample .
-    ?sample <http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.wikidata.org/entity/Q1223>
-}
+
select ?seq ?sample
+{
+    ?seq <http://biohackathon.org/bh20-seq-schema#MainSchema/sample> ?sample .
+    ?sample <http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.wikidata.org/entity/Q1223>
+}
 
@@ -586,11 +577,26 @@ and fetch all sequences referring to Washington state which lists 300 sequences originating from Washington state! Which is almost half of the set coming out of GenBank.

+ +

+Likewise to list all sequences from Turkey we can find the wikidata +entity is Q43: +

+ +
+
select ?seq ?sample
+{
+    ?seq <http://biohackathon.org/bh20-seq-schema#MainSchema/sample> ?sample .
+    ?sample <http://purl.obolibrary.org/obo/GAZ_00000448> <http://www.wikidata.org/entity/Q43>
+}
+
+
+ -
-

6 Discussion

+
+

6 Discussion

The public sequence uploader collects sequences, raw data and @@ -601,8 +607,8 @@ referenced in publications and origins are citeable.

-
-

7 Acknowledgements

+
+

7 Acknowledgements

The overall effort was due to magnificent freely donated input by a @@ -617,7 +623,7 @@ Garrison this initiative would not have existed!

-
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-05-29 Fri 12:06
. +
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-17 Fri 05:02
.
diff --git a/doc/blog/using-covid-19-pubseq-part4.html b/doc/blog/using-covid-19-pubseq-part4.html index b5a05ca..c975c21 100644 --- a/doc/blog/using-covid-19-pubseq-part4.html +++ b/doc/blog/using-covid-19-pubseq-part4.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - + COVID-19 PubSeq (part 4) @@ -161,6 +161,19 @@ .footdef { margin-bottom: 1em; } .figure { padding: 1em; } .figure p { text-align: center; } + .equation-container { + display: table; + text-align: center; + width: 100%; + } + .equation { + vertical-align: middle; + } + .equation-label { + display: table-cell; + text-align: right; + vertical-align: middle; + } .inlinetask { padding: 10px; border: 2px solid gray; @@ -186,7 +199,7 @@ @licstart The following is the entire license notice for the JavaScript code in this tag. -Copyright (C) 2012-2018 Free Software Foundation, Inc. +Copyright (C) 2012-2020 Free Software Foundation, Inc. The JavaScript code in this tag is free software: you can redistribute it and/or modify it under the terms of the GNU @@ -235,15 +248,16 @@ for the JavaScript code in this tag.

Table of Contents

-
-

1 What does this mean?

+
+

1 What does this mean?

This means that when someone uploads a SARS-CoV-2 sequence using one @@ -253,18 +267,28 @@ which triggers a rerun of our workflows.

- -
-

2 Modify Workflow

+
+

2 Where can I find the workflows?

+Workflows are written in the Common Workflow Language (CWL) and listed +on GitHub. Because PubSeq is an open project, these workflows can be studied +and modified!

+
+
+ +
+

3 Modify Workflow

+
+

Work in progress!

-
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-12 Sun 06:24
. +
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-17 Fri 01:47
.
diff --git a/doc/blog/using-covid-19-pubseq-part5.html b/doc/blog/using-covid-19-pubseq-part5.html index 80bf559..4caa5ac 100644 --- a/doc/blog/using-covid-19-pubseq-part5.html +++ b/doc/blog/using-covid-19-pubseq-part5.html @@ -3,7 +3,7 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - + COVID-19 PubSeq (part 4) @@ -161,6 +161,19 @@ .footdef { margin-bottom: 1em; } .figure { padding: 1em; } .figure p { text-align: center; } + .equation-container { + display: table; + text-align: center; + width: 100%; + } + .equation { + vertical-align: middle; + } + .equation-label { + display: table-cell; + text-align: right; + vertical-align: middle; + } .inlinetask { padding: 10px; border: 2px solid gray; @@ -186,7 +199,7 @@ @licstart The following is the entire license notice for the JavaScript code in this tag. -Copyright (C) 2012-2018 Free Software Foundation, Inc. +Copyright (C) 2012-2020 Free Software Foundation, Inc. The JavaScript code in this tag is free software: you can redistribute it and/or modify it under the terms of the GNU @@ -235,38 +248,40 @@ for the JavaScript code in this tag.

Table of Contents

-
-

1 Modify Metadata

+
+

1 Modify Metadata

The public sequence resource uses multiple data formats listed on the -DOWNLOAD page. One of the most exciting features is the full support +download page. One of the most exciting features is the full support for RDF and semantic web/linked data ontologies. This technology allows for querying data in unprescribed ways - that is, you can formulate your own queries without dealing with a preset model of that data (so typical of CSV files and SQL tables). Examples of exploring -data are listed here. +data are listed here.

In this BLOG we are going to look at the metadata entered on the -COVID-19 PubSeq website (or command line client). It is important to +COVID-19 PubSeq website (or command line client). It is important to understand that anyone, including you, can change that information!

-
-

2 What is the schema?

+
+

2 What is the schema?

The default metadata schema is listed here. @@ -274,8 +289,8 @@ The default metadata schema is listed -

3 How is the website generated?

+
+

3 How is the website generated?

Using the schema we use pyshex shex expressions and schema salad to @@ -285,13 +300,13 @@ All from that one metadata schema.

-
-

4 Modifying the schema

+
+

4 Modifying the schema

-One of the first things we wanted to do is to add a field for the data -license. Initially we only support CC-4.0 as a license by default, but -now we want to give uploaders the option to make it an even more +One of the first things we want to do is to add a field for the data +license. Initially we only supported CC-4.0 as a license, but +we wanted to give uploaders the option to use an even more liberal CC0 license. The first step is to find a good ontology term for the field. Searching for `creative commons cc0 rdf' rendered this useful page. We also find an overview where CC0 is represented as URI @@ -302,13 +317,148 @@ attributionName and attributionURL.

-Note: work in progress +A minimal triple should be +

+ +
+id  xhtml:license  <http://creativecommons.org/licenses/by/4.0/> .
+
+ + +

+Other suggestions are +

+ +
+id  dc:title "Description" .
+id  cc:attributionName "Your Name" .
+id  cc:attributionURL <http://resource.org/id>
+
+ + +

+and 'dc:source' which indicates the original source of any modified +work, specified as a URI. +The prefix 'cc:' is an abbreviation for http://creativecommons.org/ns#. +

+ +

+Going back to the schema, where does it fit? Under host, sample, +virus, technology or submitter block? It could fit under sample, but +actually the license concerns the whole metadata block and sequence, +so I think we can fit under its own license tag. For example +

+ + +

+id: placeholder +

+ +
+license:
+    license_type: http://creativecommons.org/licenses/by/4.0/
+    attribution_title: "Sample ID"
+    attribution_name: "John doe, Joe Boe, Jonny Oe"
+    attribution_url: http://covid19.genenetwork.org/id
+    attribution_source: https://www.ncbi.nlm.nih.gov/pubmed/323088888
+
+ + +

+So, let's update the example. Notice the license info is optional - if it is missing +we just assume the default CC-4.0. +

+ +

+One thing that is interesting is that in the name space https://creativecommons.org/ns there +is no mention of a title. I think it is useful, however, because we have no such field. +So, we'll add it simply as a title field. Now the draft schema is

+ +
+
- name: licenseSchema
+  type: record
+  fields:
+    license_type:
+      doc: License types as refined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#License
+    title:
+      doc: Attribution title related to license
+      type: string?
+      jsonldPredicate:
+          _id: http://semanticscience.org/resource/SIO_001167
+    attribution_url:
+      doc: Attribution URL related to license
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#Work
+    attribution_source:
+      doc: Attribution source URL
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#Work
+
+
+ +

+Now, we are no ontology experts, right? So, next we submit a patch to +our source tree and ask for feedback before wiring it up in the data +entry form. The pull request was submitted here and reviewed on the +gitter channel and I merged it. +

+
+ +
+

5 Adding fields to the form

+
+

+To add the new fields to the form we have to modify it a little. If we +go to the upload form we need to add the license box. The schema is +loaded in main.py in the 'generate_form' function.

+ +

+With this patch the website adds the license input fields on the form. +

+ +

+Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This +was done with this patch. In the end we decided to use the Dublin core title, +http://purl.org/metadata/dublin_core_elements#Title: +

+ +
+
:licenseShape{
+    cc:License xsd:string;
+    dc:Title xsd:string ?;
+    cc:attributionName xsd:string ?;
+    cc:attributionURL xsd:string ?;
+    cc:attributionSource xsd:string ?;
+}
+
+
+ +

+Note that cc:attributionSource is not really defined in the cc standard.

+ +

+When pushing the license info we discovered the workflow broke because +the existing data had no licensing info. So we changed the license +field to be optional - a missing license assumes it is CC-BY-4.0. +

+
+
+ +
+

6 TODO Testing the license fields

-
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-12 Sun 06:24
. +
Created by Pjotr Prins (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!
Modified 2020-07-16 Thu 03:27
.
-- cgit v1.2.3