From 53a47a18a45270c3f037fabe9cc973c66bfe50bf Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 8 Jan 2021 23:44:47 +0100 Subject: sample_id in the FASTA has to match the sample_id in the YAML --- workflows/yamlfa2ttl/check_metadata.cwl | 7 +++++-- workflows/yamlfa2ttl/check_metadata.py | 28 ++++++++++++++++++++++++---- workflows/yamlfa2ttl/check_sequence.py | 4 ++-- workflows/yamlfa2ttl/yamlfa2ttl.cwl | 20 +++++++------------- 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/workflows/yamlfa2ttl/check_metadata.cwl b/workflows/yamlfa2ttl/check_metadata.cwl index 72c4d36..593155b 100644 --- a/workflows/yamlfa2ttl/check_metadata.cwl +++ b/workflows/yamlfa2ttl/check_metadata.cwl @@ -12,13 +12,16 @@ inputs: path_yaml: type: string inputBinding: {position: 2} + path_fasta: + type: string + inputBinding: {position: 3} path_schema_yaml: type: File - inputBinding: {position: 3} + inputBinding: {position: 4} default: {class: File, location: ../../bh20sequploader/bh20seq-schema.yml} path_shex_rdf: type: File - inputBinding: {position: 4} + inputBinding: {position: 5} default: {class: File, location: ../../bh20sequploader/bh20seq-shex.rdf} outputs: [] diff --git a/workflows/yamlfa2ttl/check_metadata.py b/workflows/yamlfa2ttl/check_metadata.py index 05494ca..6dd0d5b 100644 --- a/workflows/yamlfa2ttl/check_metadata.py +++ b/workflows/yamlfa2ttl/check_metadata.py @@ -6,12 +6,14 @@ import schema_salad.jsonld_context from pyshex.evaluate import evaluate path_yaml = sys.argv[1] -path_schema_yaml = sys.argv[2] -path_shex_rdf = sys.argv[3] +path_fasta = sys.argv[2] +path_schema_yaml = sys.argv[3] +path_shex_rdf = sys.argv[4] with open(path_schema_yaml, "rb") as f: cache = { - "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode("utf-8") + "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode( + "utf-8") } metadata_schema = 
schema_salad.schema.load_schema( @@ -29,7 +31,10 @@ with open(path_shex_rdf, "rb") as f: doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, path_yaml, True) g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx) -rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape") +rslt, reason = evaluate( + g, shex, doc["id"], + "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape" +) # As part of QC make sure serialization works too, this will raise # an exception if there are invalid URIs. @@ -37,3 +42,18 @@ g.serialize(format="ntriples") if not rslt: raise Exception(reason) + +# The sample_id in the FASTA header has to be equal to the sample_id in the YAML file +sample_id_from_metadata = metadata['sample']['sample_id'] + +sample_id_from_fasta = '' + +with open(path_fasta) as f: + for line in f: + sample_id_from_fasta = line.strip().split(' ')[0][1:] + break + +if sample_id_from_metadata != sample_id_from_fasta: + raise ValueError( + f"sample_id in the YAML file '{sample_id_from_metadata}' is different from the sample_id in the FASTA '{sample_id_from_fasta}'" + ) diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py index f92bf6d..58a65b9 100644 --- a/workflows/yamlfa2ttl/check_sequence.py +++ b/workflows/yamlfa2ttl/check_sequence.py @@ -26,7 +26,7 @@ def read_single_fasta(path_fasta): return header, ''.join(sequence) -print("FASTA QC: checking similarity to the reference") +print("FASTA QC: checking similarity to the reference", file=sys.stderr) header, sequence = read_single_fasta(path_fasta) @@ -46,7 +46,7 @@ with tempfile.NamedTemporaryFile() as tmp_fasta: "minimap2", "-c", "-x", "asm20", tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name ] - print(" ".join(cmd)) + print(" ".join(cmd), file=sys.stderr) result = 
subprocess.run(cmd, stdout=subprocess.PIPE) result.check_returncode() diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl index 2913e99..563f00f 100644 --- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl +++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl @@ -1,10 +1,3 @@ -~/.config/guix/current/bin/guix environment -C guix --ad-hoc cwltool python python-biopython python-requests python-dateutil python-magic ruby -cwltool --preserve-environment PYTHONPATH yamlfa2ttl.cwl --path_fasta ~/bh20-seq-resource/example/sequence.fasta - -cwltool --no-container --preserve-environment GUIX_ENVIRONMENT --preserve-environment PYTHONPATH yamlfa2ttl.cwl --path_fasta ~/bh20-seq-resource/example/sequence.fasta - - - #!/usr/bin/env cwl-runner cwlVersion: v1.1 @@ -12,7 +5,7 @@ class: Workflow doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)" inputs: - path_fasta: + path-fasta: type: string doc: input FASTA to validate @@ -20,14 +13,14 @@ inputs: type: string default: text/fasta - path_yaml: + path-yaml: type: string doc: input YAML to validate and convert in TTL steps: check_format: in: - path_fasta: path_fasta + path_fasta: path-fasta format_to_check: format_to_check doc: the input has to be a valid FASTA format file out: [] @@ -35,15 +28,16 @@ steps: check_sequence: in: - path_fasta: path_fasta + path_fasta: path-fasta doc: the input sequence has to be enough similar to the reference out: [] run: check_sequence.cwl check_metadata: in: - path_yaml: path_yaml - doc: the input metadata information to put in the knowledge graph + path_yaml: path-yaml + path_fasta: path-fasta + doc: check the input metadata information to put in the knowledge graph out: [] run: check_metadata.cwl -- cgit v1.2.3