diff options
-rw-r--r-- | workflows/yamlfa2ttl/check_format.cwl | 23 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/check_format.py | 31 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/check_sequence.cwl | 20 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/check_sequence.py | 63 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/yamlfa2ttl.cwl | 32 |
5 files changed, 169 insertions, 0 deletions
diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl new file mode 100644 index 0000000..9eee58a --- /dev/null +++ b/workflows/yamlfa2ttl/check_format.cwl @@ -0,0 +1,23 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 + +inputs: + script: + type: File + inputBinding: {position: 1} + default: {class: File, location: check_format.py} + path_fasta: + type: string + inputBinding: {position: 2} + format_to_check: + type: string + inputBinding: {position: 3} + path_valid_formats: + type: File + inputBinding: {position: 4} + default: {class: File, location: ../../bh20sequploader/validation/formats} + +outputs: [] diff --git a/workflows/yamlfa2ttl/check_format.py b/workflows/yamlfa2ttl/check_format.py new file mode 100644 index 0000000..4472b18 --- /dev/null +++ b/workflows/yamlfa2ttl/check_format.py @@ -0,0 +1,31 @@ +import gzip +import tempfile +import magic +import io +import sys + +path_fasta = sys.argv[1] +format_to_check = sys.argv[2] +path_valid_formats = sys.argv[3] + +with tempfile.NamedTemporaryFile() as tmp: + with open(path_valid_formats, 'rb') as f: + tmp.write(f.read()) + tmp.flush() + + check_format = magic.Magic(magic_file=tmp.name, uncompress=False, mime=True) + +with open(path_fasta, "rb") as f: + gz = "" + if path_fasta.endswith(".gz"): + gz = ".gz" + f = gzip.GzipFile(fileobj=f, mode='rb') + + f = io.TextIOWrapper(f) + + buffer = f.read(4096) + seq_type = check_format.from_buffer(buffer).lower() + f.detach() + + if seq_type != format_to_check: + raise ValueError(f"Input file ({path_fasta}) does not look like a {format_to_check}") diff --git a/workflows/yamlfa2ttl/check_sequence.cwl b/workflows/yamlfa2ttl/check_sequence.cwl new file mode 100644 index 0000000..946948d --- /dev/null +++ b/workflows/yamlfa2ttl/check_sequence.cwl @@ -0,0 +1,20 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 + +inputs: + script: + type: File + inputBinding: {position: 1} + default: {class: File, location: check_sequence.py} + path_fasta: + type: string + inputBinding: {position: 2} + path_sars_cov_2_reference_fasta: + type: File + inputBinding: {position: 3} + default: {class: File, location: ../../bh20sequploader/SARS-CoV-2-reference.fasta} + +outputs: [] diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py new file mode 100644 index 0000000..f92bf6d --- /dev/null +++ b/workflows/yamlfa2ttl/check_sequence.py @@ -0,0 +1,63 @@ +import subprocess +import tempfile +import sys + +path_fasta = sys.argv[1] +path_sars_cov_2_reference_fasta = sys.argv[2] + + +def read_single_fasta(path_fasta): + with open(path_fasta) as f: + entries = 0 + header = None + sequence = [] + for line in f: + line = line.strip() + + if line.startswith(">"): + header = line + entries += 1 + else: + sequence.append(line) + + if entries > 1: + raise ValueError("Input FASTA contains multiple entries") + + return header, ''.join(sequence) + + +print("FASTA QC: checking similarity to the reference") + +header, sequence = read_single_fasta(path_fasta) + +similarity = 0 + +with tempfile.NamedTemporaryFile() as tmp_fasta: + with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta: + with open(path_sars_cov_2_reference_fasta, 'rb') as f: + tmp_sars_cov_2_reference_fasta.write(f.read()) + tmp_sars_cov_2_reference_fasta.flush() + + tmp_fasta.write(f'>{header}\n'.encode("utf8")) + tmp_fasta.write(sequence.encode("utf8")) + tmp_fasta.flush() + + cmd = [ + "minimap2", "-c", "-x", "asm20", + tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name + ] + print(" ".join(cmd)) + + result = subprocess.run(cmd, stdout=subprocess.PIPE) + result.check_returncode() + + paf_output = result.stdout.decode("utf-8") + paf_output = paf_output.split("\t") + if len(paf_output) >= 10: + # Number of matching bases in the mapping / Target sequence length + similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0 + +if similarity < 70.0: + raise ValueError( + f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')" + ) diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl new file mode 100644 index 0000000..143fc9d --- /dev/null +++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl @@ -0,0 +1,32 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.1 +class: Workflow +doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)" + +inputs: + path_fasta: + type: string + doc: input fasta to validate + + format_to_check: + type: string + default: text/fasta + +steps: + check_format: + in: + path_fasta: path_fasta + format_to_check: format_to_check + doc: the input has to be a valid FASTA format file + out: [] + run: check_format.cwl + + check_sequence: + in: + path_fasta: path_fasta + doc: the input sequence has to be enough similar to the reference + out: [] + run: check_sequence.cwl + +outputs: [] |