diff options
| field | value | date |
|---|---|---|
| author | AndreaGuarracino | 2021-01-05 14:18:26 +0100 |
| committer | AndreaGuarracino | 2021-01-05 14:18:26 +0100 |
| commit | e9f73388c810c678a69ce23c925d7914ba99a251 (patch) | |
| tree | 04731e150e44a2d102f49b431527b73fb1caad2f | |
| parent | 491b39273b54ca89861febf5c27d28f032ba1c49 (diff) | |
| download | bh20-seq-resource-e9f73388c810c678a69ce23c925d7914ba99a251.tar.gz bh20-seq-resource-e9f73388c810c678a69ce23c925d7914ba99a251.tar.lz bh20-seq-resource-e9f73388c810c678a69ce23c925d7914ba99a251.zip | |
added check_format workflow and script

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | workflows/yamlfa2ttl/check_format.cwl | 22 |
| -rw-r--r-- | workflows/yamlfa2ttl/check_format.py | 35 |
| -rw-r--r-- | workflows/yamlfa2ttl/yamlfa2ttl.cwl | 31 |

3 files changed, 69 insertions, 19 deletions
| diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl new file mode 100644 index 0000000..24de620 --- /dev/null +++ b/workflows/yamlfa2ttl/check_format.cwl @@ -0,0 +1,22 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 +inputs: + script: + type: File + inputBinding: {position: 1} + default: {class: File, location: check_format.py} + path_fasta: + type: string + inputBinding: {position: 2} + format_to_check: + type: string + inputBinding: {position: 3} + path_valid_formats: + type: File + inputBinding: {position: 4} + default: {class: File, location: ../../bh20sequploader/validation/formats} +outputs: [] + diff --git a/workflows/yamlfa2ttl/check_format.py b/workflows/yamlfa2ttl/check_format.py new file mode 100644 index 0000000..a1b1f14 --- /dev/null +++ b/workflows/yamlfa2ttl/check_format.py @@ -0,0 +1,35 @@ +import gzip +import tempfile +import pkg_resources +import magic +import io + +import sys + +path_fasta = sys.argv[1] +format_to_check = sys.argv[2] +path_valid_formats = sys.argv[3] + + +# ../../bh20sequploader/validation/formats + +schema_resource = pkg_resources.resource_stream(__name__, path_valid_formats) +with tempfile.NamedTemporaryFile() as tmp: + tmp.write(schema_resource.read()) + tmp.flush() + check_format = magic.Magic(magic_file=tmp.name, uncompress=False, mime=True) + +with open(path_fasta, "rb") as f: + gz = "" + if path_fasta.endswith(".gz"): + gz = ".gz" + f = gzip.GzipFile(fileobj=f, mode='rb') + + f = io.TextIOWrapper(f) + + buffer = f.read(4096) + seq_type = check_format.from_buffer(buffer).lower() + f.detach() + + if seq_type != format_to_check: + raise ValueError(f"Input file ({path_fasta}) does not look like a {format_to_check}") diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl index 2580794..1dce9ca 100644 --- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl +++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl @@ -1,31 +1,24 @@ 
#!/usr/bin/env cwl-runner cwlVersion: v1.1 -class: CommandLineTool +class: Workflow doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)" inputs: path_fasta: - type: File - inputBinding: - position: 1 - path_yaml: - type: File - inputBinding: - position: 2 + type: string + doc: input fasta to validate + + format_to_check: + type: string + default: text/fasta steps: check_format: - in: {path_fasta: path_fasta, path_valid_formats: '../../bh20sequploader/validation/formats', format_to_check: 'text/fasta'} - #out: true/false or nothing and it has to block the execution if the format is wrong + in: + path_fasta: path_fasta + format_to_check: format_to_check + out: [] run: check_format.cwl - check_metadata: - # input and output - # run: check_metadata.cwl - check_header: - # id_fasta has to be equal to id_yaml - # run: check_header.cwl - check_sequence: - # The sequence has to be similar to the reference - # run: check_sequence.cwl +outputs: [] | 
