From b9691c7deae30bd6422fb7b0681572b7b6f78ae3 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 15 Jul 2020 14:16:11 +0100 Subject: Web: add license to input form --- bh20sequploader/bh20seq-schema.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index b3d4d12..29ac22c 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -15,7 +15,7 @@ $graph: fields: license_type: doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf - type: string? + type: string jsonldPredicate: _id: https://creativecommons.org/ns#License title: @@ -258,6 +258,7 @@ $graph: virus: virusSchema technology: technologySchema submitter: submitterSchema + license: licenseSchema id: doc: The subject (eg the fasta/fastq file) that the metadata describes type: string -- cgit v1.2.3 From f4ed46dae20abe5147871495ede2d6ac2b0854bc Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 15 Jul 2020 14:30:56 +0100 Subject: Add RDF output --- bh20sequploader/bh20seq-schema.yml | 9 +++++++-- bh20sequploader/bh20seq-shex.rdf | 24 +++++++++++++++++------- doc/blog/using-covid-19-pubseq-part5.org | 2 ++ 3 files changed, 26 insertions(+), 9 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 29ac22c..c690e8a 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -23,16 +23,21 @@ $graph: type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 + attribution_name: + doc: Attribution NAME related to data license + type: string? + jsonldPredicate: + _id: https://creativecommons.org/ns#attributionName attribution_url: doc: Attribution URL related to data license type: string? 
jsonldPredicate: - _id: https://creativecommons.org/ns#Work + _id: https://creativecommons.org/ns#attributionURL attribution_source: doc: Attribution source URL related to data license type: string? jsonldPredicate: - _id: https://creativecommons.org/ns#Work + _id: https://creativecommons.org/ns#attributionSource - name: hostSchema type: record diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 965229c..c48267d 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -1,6 +1,7 @@ PREFIX : PREFIX MainSchema: PREFIX hostSchema: +PREFIX cc: PREFIX xsd: PREFIX obo: PREFIX sio: @@ -15,10 +16,11 @@ PREFIX wikidata: MainSchema:submitter @:submitterShape ; MainSchema:technology @:technologyShape ; MainSchema:virus @:virusShape; + MainSchema:license @:licenseShape; } :hostShape { - efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; + efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; sio:SIO_000115 xsd:string ?; obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?; obo:PATO_0000011 xsd:integer ?; @@ -32,14 +34,14 @@ PREFIX wikidata: :sampleShape { sio:SIO_000115 xsd:string; - evs:C25164 xsd:string; - obo:GAZ_00000448 [wikidata:~] ; + evs:C25164 xsd:string; + obo:GAZ_00000448 [wikidata:~] ; obo:OBI_0001895 xsd:string ?; obo:NCIT_C41206 xsd:string ?; obo:OBI_0001479 IRI {0,2}; obo:OBI_0001472 xsd:string ?; sio:SIO_001167 xsd:string ?; - edam:data_2091 IRI {0,3}; + edam:data_2091 IRI {0,3}; } :submitterShape { @@ -47,7 +49,7 @@ PREFIX wikidata: sio:SIO_000116 xsd:string *; sio:SIO_000172 xsd:string ?; obo:NCIT_C37984 xsd:string ?; - obo:NCIT_C37900 xsd:string ?; + obo:NCIT_C37900 xsd:string ?; efo:EFO_0001741 xsd:string ?; obo:NCIT_C42781 xsd:string ?; obo:NCIT_C19026 xsd:string ?; @@ -63,6 +65,14 @@ PREFIX wikidata: } :virusShape{ - edam:data_1875 [ obo:NCBITaxon_~ ] ; - sio:SIO_010055 xsd:string ?; + edam:data_1875 [ obo:NCBITaxon_~ ] ; + sio:SIO_010055 xsd:string ?; } + +:licenseShape{ + cc:License xsd:string; + 
sio:SIO_001167 xsd:string ?; + cc:attributionName xsd:string ?; + cc:attributionURL xsd:string ?; + cc:attributionSource xsd:string ?; +} \ No newline at end of file diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org index aa06d5e..cb11f43 100644 --- a/doc/blog/using-covid-19-pubseq-part5.org +++ b/doc/blog/using-covid-19-pubseq-part5.org @@ -125,4 +125,6 @@ To add the new fields to the form we have to modify it a little. If we go to the upload form we need to add the license box. The schema is loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229][main.py]] in the 'generate_form' function. +With this [[https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3][patch]] the website adds the license input fields on the form. + /Note: work in progress/ -- cgit v1.2.3 From 712614e5627e54df7ec6ab975dc86a1055051455 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Wed, 15 Jul 2020 14:54:59 +0100 Subject: License RDF --- bh20sequploader/bh20seq-schema.yml | 3 ++- bh20sequploader/bh20seq-shex.rdf | 3 ++- doc/blog/using-covid-19-pubseq-part5.org | 29 +++++++++++++++++++++++------ 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index c690e8a..ef55c55 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,6 +1,7 @@ $base: http://biohackathon.org/bh20-seq-schema $namespaces: cc: http://creativecommons.org/ns# + dc: http://purl.org/metadata/dublin_core_elements# sch: https://schema.org/ efo: http://www.ebi.ac.uk/efo/ obo: http://purl.obolibrary.org/obo/ @@ -22,7 +23,7 @@ $graph: doc: Attribution title related to data license type: string? 
jsonldPredicate: - _id: http://semanticscience.org/resource/SIO_001167 + _id: http://purl.org/metadata/dublin_core_elements#Title attribution_name: doc: Attribution NAME related to data license type: string? diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index c48267d..9fab334 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -2,6 +2,7 @@ PREFIX : PREFIX hostSchema: PREFIX cc: +PREFIX dc: PREFIX xsd: PREFIX obo: PREFIX sio: @@ -71,7 +72,7 @@ PREFIX wikidata: :licenseShape{ cc:License xsd:string; - sio:SIO_001167 xsd:string ?; + dc:Title xsd:string ?; cc:attributionName xsd:string ?; cc:attributionURL xsd:string ?; cc:attributionSource xsd:string ?; diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org index cb11f43..98c2c31 100644 --- a/doc/blog/using-covid-19-pubseq-part5.org +++ b/doc/blog/using-covid-19-pubseq-part5.org @@ -14,19 +14,20 @@ - [[#how-is-the-website-generated][How is the website generated?]] - [[#modifying-the-schema][Modifying the schema]] - [[#adding-fields-to-the-form][Adding fields to the form]] + - [[#testing-the-license-fields][Testing the license fields]] * Modify Metadata The public sequence resource uses multiple data formats listed on the -[[./download][DOWNLOAD]] page. One of the most exciting features is the full support +[[http://covid19.genenetwork.org/download][download]] page. One of the most exciting features is the full support for RDF and semantic web/linked data ontologies. This technology allows for querying data in unprescribed ways - that is, you can formulate your own queries without dealing with a preset model of that data (so typical of CSV files and SQL tables). Examples of exploring -data are listed [[./blog?id=using-covid-19-pubseq-part1][here]]. +data are listed [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1][here]]. 
In this BLOG we are going to look at the metadata entered on the -[[./][COVID-19 PubSeq]] website (or command line client). It is important to +COVID-19 PubSeq website (or command line client). It is important to understand that anyone, including you, can change that information! * What is the schema? @@ -42,8 +43,8 @@ All from that one metadata schema. * Modifying the schema One of the first things we want to do is to add a field for the data -license. Initially we only support CC-4.0 as a license by default, but -now we want to give uploaders the option to make it an even more +license. Initially we only supported CC-4.0 as a license, but +we wanted to give uploaders the option to use an even more liberal CC0 license. The first step is to find a good ontology term for the field. Searching for `creative commons cc0 rdf' rendered this useful [[https://creativecommons.org/ns][page]]. We also find an [[https://wiki.creativecommons.org/wiki/CC_License_Rdf_Overview][overview]] where CC0 is represented as URI @@ -127,4 +128,20 @@ loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e With this [[https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3][patch]] the website adds the license input fields on the form. -/Note: work in progress/ +Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This +was done with this [[https://github.com/arvados/bh20-seq-resource/commit/f4ed46dae20abe5147871495ede2d6ac2b0854bc][patch]]. In the end we decided to use the Dublin Core title, +http://purl.org/metadata/dublin_core_elements#Title: + +#+BEGIN_SRC js +:licenseShape{ + cc:License xsd:string; + dc:Title xsd:string ?; + cc:attributionName xsd:string ?; + cc:attributionURL xsd:string ?; + cc:attributionSource xsd:string ?; +} +#+END_SRC + +Note that cc:attributionSource is not really defined in the cc standard.
+ +* TODO Testing the license fields -- cgit v1.2.3 From 01b192417f858d4389226b5130a430bd3b6d4416 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 16 Jul 2020 09:25:58 +0100 Subject: Make license optional for now --- bh20sequploader/bh20seq-schema.yml | 2 +- bh20sequploader/bh20seq-shex.rdf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index ef55c55..ee852fa 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -16,7 +16,7 @@ $graph: fields: license_type: doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf - type: string + type: string? jsonldPredicate: _id: https://creativecommons.org/ns#License title: diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 9fab334..7331e86 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -71,7 +71,7 @@ PREFIX wikidata: } :licenseShape{ - cc:License xsd:string; + cc:License xsd:string ?; dc:Title xsd:string ?; cc:attributionName xsd:string ?; cc:attributionURL xsd:string ?; -- cgit v1.2.3 From a10569f51072569604b2384e6e4d583b36de73c4 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 12:01:38 -0400 Subject: Make license optional for now Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/bh20seq-schema.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index ee852fa..0aead3b 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -16,7 +16,7 @@ $graph: fields: license_type: doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf - type: string? 
+ type: string jsonldPredicate: _id: https://creativecommons.org/ns#License title: @@ -264,7 +264,7 @@ $graph: virus: virusSchema technology: technologySchema submitter: submitterSchema - license: licenseSchema + license: ["null", licenseSchema] id: doc: The subject (eg the fasta/fastq file) that the metadata describes type: string -- cgit v1.2.3 From 0e84b18cb134855d572d1f94d5d3c43571afe7e9 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 12:04:26 -0400 Subject: Make license optional Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/bh20seq-shex.rdf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 7331e86..bbc7309 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -17,7 +17,7 @@ PREFIX wikidata: MainSchema:submitter @:submitterShape ; MainSchema:technology @:technologyShape ; MainSchema:virus @:virusShape; - MainSchema:license @:licenseShape; + MainSchema:license @:licenseShape ?; } :hostShape { @@ -71,7 +71,7 @@ PREFIX wikidata: } :licenseShape{ - cc:License xsd:string ?; + cc:License xsd:string ; dc:Title xsd:string ?; cc:attributionName xsd:string ?; cc:attributionURL xsd:string ?; -- cgit v1.2.3 From d34374f0e822edd1539ea5de6f8522f2b761de3f Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 14:48:22 -0400 Subject: Improve uploader reporting. 
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/main.py | 22 ++++++++++------------ bh20simplewebuploader/main.py | 4 ++-- bh20simplewebuploader/templates/error.html | 2 +- bh20simplewebuploader/templates/success.html | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index f744a8c..6049bf9 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True): try: log.debug("Checking metadata" if do_qc else "Skipping metadata check") if do_qc and not qc_metadata(metadata.name): - log.warning("Failed metadata qc") + log.warning("Failed metadata QC") failed = True except Exception as e: - log.debug(e) - print(e) + log.exception("Failed metadata QC") failed = True target = [] @@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True): target[0] = ("reads_1."+target[0][0][6:], target[0][1]) target[1] = ("reads_2."+target[1][0][6:], target[0][1]) except Exception as e: - log.debug(e) - print(e) + log.exception("Failed sequence QC") failed = True if failed: @@ -82,7 +80,7 @@ def main(): seqlabel = target[0][1] if args.validate: - print("Valid") + log.info("Valid") exit(0) col = arvados.collection.Collection(api_client=api) @@ -91,10 +89,10 @@ def main(): if args.sequence_p2: upload_sequence(col, target[1], args.sequence_p2) - print("Reading metadata") + log.info("Reading metadata") with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) - print(r[0:20]) + log.info(r[0:20]) while r: f.write(r) r = args.metadata.read(65536) @@ -118,7 +116,7 @@ def main(): ["portable_data_hash", "=", col.portable_data_hash()]]).execute() if dup["items"]: # This exact collection has been uploaded before. 
- print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]])) + log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]])) exit(1) if args.trusted: @@ -131,9 +129,9 @@ def main(): (seqlabel, properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) - print("Saved to %s" % col.manifest_locator()) - - print("Done") + log.info("Saved to %s" % col.manifest_locator()) + log.info("Done") + exit(0) if __name__ == "__main__": main() diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index 62b68d9..c814f30 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -445,12 +445,12 @@ def receive_files(): if result.returncode != 0: # It didn't work. Complain. - error_message="Uploader returned value {} and said:".format(result.returncode) + str(result.stderr.decode('utf-8')) + error_message="Uploader returned value {} and said:\n".format(result.returncode) + str(result.stderr.decode('utf-8')) print(error_message, file=sys.stderr) return (render_template('error.html', error_message=error_message), 403) else: # It worked. Say so. - return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) + return render_template('success.html', log=result.stderr.decode('utf-8', errors='replace')) finally: shutil.rmtree(dest_dir) diff --git a/bh20simplewebuploader/templates/error.html b/bh20simplewebuploader/templates/error.html index b1d9402..fc08aed 100644 --- a/bh20simplewebuploader/templates/error.html +++ b/bh20simplewebuploader/templates/error.html @@ -15,7 +15,7 @@

- Click here to try again. + Click here to try again.


diff --git a/bh20simplewebuploader/templates/success.html b/bh20simplewebuploader/templates/success.html index 9f0987c..c2302fa 100644 --- a/bh20simplewebuploader/templates/success.html +++ b/bh20simplewebuploader/templates/success.html @@ -9,7 +9,7 @@

Upload Successful


- Your files have been uploaded. They should soon appear as output of the Public SARS-CoV-2 Sequence Resource. + Your files have been uploaded. You can track their QC status; once validated, they will be part of the Public SARS-CoV-2 Sequence Resource.

The upload log was: -- cgit v1.2.3 From b1750731b654be3322a6793f47d52fafcaaea9ac Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 21:24:05 -0400 Subject: Report similarity == 0 Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/qc_fasta.py | 4 +--- scripts/cleanup.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'bh20sequploader') diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index 37eb4e8..0c7e16d 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True): except Exception as e: logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e) - if similarity and similarity < 70.0: + if similarity < 70.0: raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity)) - if similarity == 0: - raise ValueError("QC fail") return ("sequence.fasta"+gz, seqlabel) elif seq_type == "text/fastq": diff --git a/scripts/cleanup.py b/scripts/cleanup.py index 6a82659..78f34c8 100644 --- a/scripts/cleanup.py +++ b/scripts/cleanup.py @@ -9,11 +9,14 @@ delete_patterns = [ "%missing%`host_species`%", "%QC fail: alignment%", "%does not look like a valid URI%", - "%Duplicate of%" + "%Duplicate of%", + "%No matching triples found for predicate obo:NCIT_C42781%", + "%does not look like a valid URI%" ] revalidate_patterns = [ - "%missing%`license`%" + "%missing%`license`%", + "%QC fail%" ] for p in delete_patterns: -- cgit v1.2.3