From 1219eaf496c899f3043b90e30eb956f0f363bfb3 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 20 Apr 2020 12:50:03 -0400 Subject: Add ShEx validation Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/bh20seq-shex.rdf | 59 ++++++++++++++++++++++++++++++++++++ bh20sequploader/main.py | 11 ++++++- bh20sequploader/qc_metadata.py | 13 +++++++- bh20sequploader/validation_shape.rdf | 59 ------------------------------------ example/metadata.yaml | 2 +- setup.py | 7 +++-- 6 files changed, 87 insertions(+), 64 deletions(-) create mode 100644 bh20sequploader/bh20seq-shex.rdf delete mode 100644 bh20sequploader/validation_shape.rdf diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf new file mode 100644 index 0000000..8d3f5fc --- /dev/null +++ b/bh20sequploader/bh20seq-shex.rdf @@ -0,0 +1,59 @@ +PREFIX : +PREFIX MainSchema: +PREFIX hostSchema: +PREFIX xsd: +PREFIX obo: +PREFIX sio: +PREFIX efo: +PREFIX evs: +PREFIX edam: + +:submissionShape { + MainSchema:host @:hostShape ; + MainSchema:sample @:sampleShape ; + MainSchema:submitter @:submitterShape ; + MainSchema:technology @:technologyShape ; + MainSchema:virus @:virusShape; +} + +:hostShape { + efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; + obo:PATO_0000047 [ obo:NCIT_C20197 obo:NCIT_C27993 obo:NCIT_C17998 ] ; + sio:SIO_000115 xsd:string ; + obo:PATO_0000011 xsd:integer ?; + obo:NCIT_C42574 [ obo:UO_~ ] ?; + sio:SIO_001167 xsd:string ?; + efo:EFO_0000727 xsd:string ?; +} + +:sampleShape { + obo:OBI_0001895 xsd:string ; + sio:SIO_000115 xsd:string ; + sio:SIO_001167 xsd:string ; + evs:C25164 xsd:string ?; + obo:GAZ_00000448 [obo:GAZ_~] ?; + obo:OBI_0001472 xsd:string ?; + obo:OBI_0001479 xsd:string ?; +} + +:submitterShape { + sio:SIO_000116 xsd:string ; + obo:NCIT_C37984 xsd:string ; + obo:NCIT_C37900 xsd:string ?; + obo:NCIT_C42781 xsd:string ?; + obo:OBI_0600047 xsd:string ?; + sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; + sio:SIO_000172 xsd:string ?; + efo:EFO_0001741 xsd:string ?; +} + +:technologyShape { + obo:OBI_0600047 xsd:string ; + obo:FLU_0000848 xsd:integer ?; + efo:EFO_0002699 xsd:string ?; +} + +:virusShape{ + edam:data_1875 [ obo:NCBITaxon_~ ] ; + sio:SIO_010055 xsd:string ?; +} \ No newline at end of file diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index e0a6a9a..49d012d 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -21,16 +21,25 @@ def main(): parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis') parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ') parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json') + parser.add_argument("--validate", action="store_true", help="Dry run, validate only") args = parser.parse_args() api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - target = qc_fasta(args.sequence) + try: + target = qc_fasta(args.sequence) + except ValueError as e: + print(e) + exit(1) if not qc_metadata(args.metadata.name): print("Failed metadata qc") exit(1) + if args.validate: + print("Valid") + exit(0) + col = arvados.collection.Collection(api_client=api) with col.open(target, "w") as f: diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index e477f21..fbfd286 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,9 +1,12 @@ import schema_salad.schema import schema_salad.ref_resolver +import schema_salad.jsonld_context import logging import pkg_resources import logging import traceback +from rdflib import Graph, Namespace +from pyshex.evaluate import evaluate def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") @@ -13,13 +16,21 @@ def qc_metadata(metadatafile): schema_metadata, metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) + shex = pkg_resources.resource_stream(__name__, "bh20seq-shex.rdf").read().decode("utf-8") + if not isinstance(avsc_names, schema_salad.avro.schema.Names): print(avsc_names) return False try: doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) - return True + g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx) + rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape") + + if not rslt: + print(reason) + + return rslt except Exception as e: traceback.print_exc() logging.warn(e) diff --git a/bh20sequploader/validation_shape.rdf b/bh20sequploader/validation_shape.rdf deleted file mode 100644 index dd34fa9..0000000 --- a/bh20sequploader/validation_shape.rdf +++ /dev/null @@ -1,59 +0,0 @@ -PREFIX : -PREFIX MainSchema: -PREFIX hostSchema: -PREFIX xsd: -PREFIX obo: -PREFIX sio: -PREFIX efo: -PREFIX evs: -PREFIX edam: - -:submissionShape { - MainSchema:host @:hostShape ; - MainSchema:sample @:sampleShape ; - MainSchema:submitter @:submitterShape ; - MainSchema:technology @:technologyShape ; - MainSchema:virus @:virusShape; -} - -:hostShape { - efo:EFO_0000532 [ obo:NCBITaxon_~ ] ; - obo:PATO_0000047 [ obo:NCIT_C20197 obo:NCIT_C27993 obo:NCIT_C17998 ] ; - sio:SIO_000115 xsd:string ; - obo:PATO_0000011 xsd:integer ?; - obo:NCIT_C42574 [ obo:UO_~ ] ?; - sio:SIO_001167 xsd:string ?; - efo:EFO_0000727 xsd:string ?; -} - -:sampleShape { - obo:OBI_0001895 xsd:string ; - sio:SIO_000115 xsd:string ; - sio:SIO_001167 xsd:string ; - evs:C25164 xsd:string ?; - obo:GAZ_00000448 [obo:GAZ_~] ?; - obo:OBI_0001472 xsd:string ?; - obo:OBI_0001479 xsd:string ?; -} - -:submitterShape { - sio:SIO_000116 xsd:string ; - obo:NCIT_C37984 xsd:string ; - obo:NCIT_C37900 xsd:string ?; - obo:NCIT_C42781 xsd:string ?; - obo:OBI_0600047 xsd:string ?; - sio:SIO_000115 /https:\u002F\u002Forcid.org\u002F.{4}-.{4}-.{4}-.{4}/?; - sio:SIO_000172 xsd:string ?; - efo:EFO_0001741 xsd:string ?; -} - -:technologyShape { - obo:OBI_0600047 xsd:string ; - obo:FLU_0000848 xsd:integer ?; - efo:EFO_0002699 xsd:string ?; -} - -:virusShape{ - edam:data_1875 [ obo:NCBITaxon_~ ] ; - sio:SIO_010055 xsd:string ?; -} \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml index 8bbf980..1e83400 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -17,7 +17,7 @@ sample: collecting_institution: Institute that was responsible of sampeling specimen_source: http://purl.obolibrary.org/obo/NCIT_C155831 specimen_source2: http://purl.obolibrary.org/obo/NCIT_C155835 - collection_date: 2020-01-01 + collection_date: "2020-01-01" collection_location: http://www.wikidata.org/entity/Q148 sample_storage_conditions: XXX additional_collection_information: XXX diff --git a/setup.py b/setup.py index 0e91274..4ab6329 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client", "schema-salad", "python-magic"] +install_requires = ["arvados-python-client", "schema-salad", "python-magic", "pyshex"] web_requires = ["flask", "pyyaml"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) @@ -31,7 +31,10 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"], - package_data={"bh20sequploader": ["bh20seq-schema.yml", "bh20seq-options.yml", "validation/formats"], + package_data={"bh20sequploader": ["bh20seq-schema.yml", + "bh20seq-options.yml", + "bh20seq-shex.rdf", + "validation/formats"], }, install_requires=install_requires, extras_require={ -- cgit v1.2.3