From e9f73388c810c678a69ce23c925d7914ba99a251 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Tue, 5 Jan 2021 14:18:26 +0100
Subject: added check_format workflow and script

---
 workflows/yamlfa2ttl/check_format.cwl | 22 ++++++++++++++++++++++
 workflows/yamlfa2ttl/check_format.py  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 workflows/yamlfa2ttl/yamlfa2ttl.cwl   | 31 ++++++++++++-------------------
 3 files changed, 78 insertions(+), 19 deletions(-)
 create mode 100644 workflows/yamlfa2ttl/check_format.cwl
 create mode 100644 workflows/yamlfa2ttl/check_format.py

diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl
new file mode 100644
index 0000000..24de620
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_format.cwl
@@ -0,0 +1,22 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+inputs:
+  script:
+    type: File
+    inputBinding: {position: 1}
+    default: {class: File, location: check_format.py}
+  path_fasta:
+    type: string
+    inputBinding: {position: 2}
+  format_to_check:
+    type: string
+    inputBinding: {position: 3}
+  path_valid_formats:
+    type: File
+    inputBinding: {position: 4}
+    default: {class: File, location: ../../bh20sequploader/validation/formats}
+outputs: []
+
diff --git a/workflows/yamlfa2ttl/check_format.py b/workflows/yamlfa2ttl/check_format.py
new file mode 100644
index 0000000..a1b1f14
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_format.py
@@ -0,0 +1,44 @@
+"""Check that a (possibly gzip-compressed) file has the expected MIME type.
+
+Usage: check_format.py PATH_FASTA FORMAT_TO_CHECK PATH_VALID_FORMATS
+
+Raises ValueError when the detected type differs from FORMAT_TO_CHECK.
+"""
+import gzip
+import sys
+
+import magic
+
+
+def detect_format(path_fasta, path_valid_formats):
+    """Return the lower-cased MIME type detected for path_fasta."""
+    # path_valid_formats is an ordinary file path given on the command line,
+    # so libmagic can load it directly: no package-resource lookup (which
+    # resolves names relative to a package) or temporary copy is needed.
+    detector = magic.Magic(
+        magic_file=path_valid_formats, uncompress=False, mime=True
+    )
+    # Compression is inferred from the file extension only.
+    opener = gzip.open if path_fasta.endswith(".gz") else open
+    with opener(path_fasta, "rt") as handle:
+        head = handle.read(4096)
+    return detector.from_buffer(head).lower()
+
+
+def main(argv):
+    """Check the file named in argv against the requested format."""
+    if len(argv) < 4:
+        sys.exit(
+            f"usage: {argv[0]} PATH_FASTA FORMAT_TO_CHECK PATH_VALID_FORMATS"
+        )
+    path_fasta, format_to_check, path_valid_formats = argv[1:4]
+    seq_type = detect_format(path_fasta, path_valid_formats)
+    # seq_type is lower-cased, so lower-case the request before comparing.
+    if seq_type != format_to_check.lower():
+        raise ValueError(
+            f"Input file ({path_fasta}) does not look like a {format_to_check}"
+        )
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
index 2580794..1dce9ca 100644
--- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl
+++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
@@ -1,31 +1,24 @@
 #!/usr/bin/env cwl-runner
 
 cwlVersion: v1.1
-class: CommandLineTool
+class: Workflow
 doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)"
 
 inputs:
   path_fasta:
-    type: File
-    inputBinding:
-      position: 1
-  path_yaml:
-    type: File
-    inputBinding:
-      position: 2
+    type: string
+    doc: input fasta to validate
+
+  format_to_check:
+    type: string
+    default: text/fasta
 
 steps:
   check_format:
-    in: {path_fasta: path_fasta, path_valid_formats: '../../bh20sequploader/validation/formats', format_to_check: 'text/fasta'}
-    #out: true/false or nothing and it has to block the execution if the format is wrong
+    in:
+      path_fasta: path_fasta
+      format_to_check: format_to_check
+    out: []
     run: check_format.cwl
-  check_metadata:
-    # input and output
-    # run: check_metadata.cwl
-  check_header:
-    # id_fasta has to be equal to id_yaml
-    # run: check_header.cwl
-  check_sequence:
-    # The sequence has to be similar to the reference
-    # run: check_sequence.cwl
 
+outputs: []
-- 
cgit v1.2.3