diff options
author | AndreaGuarracino | 2021-01-07 23:02:54 +0100 |
---|---|---|
committer | AndreaGuarracino | 2021-01-07 23:02:54 +0100 |
commit | 141e619929cee17018417d71111063015e73c366 (patch) | |
tree | baa8f98aaed7aef615d37b5560a988381ac8030f /workflows/yamlfa2ttl | |
parent | 430090a65786b0b9dde9e1319a740f95579d0275 (diff) | |
download | bh20-seq-resource-141e619929cee17018417d71111063015e73c366.tar.gz bh20-seq-resource-141e619929cee17018417d71111063015e73c366.tar.lz bh20-seq-resource-141e619929cee17018417d71111063015e73c366.zip |
added check_sequence workflow and script
Diffstat (limited to 'workflows/yamlfa2ttl')
-rw-r--r-- | workflows/yamlfa2ttl/check_format.cwl | 3 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/check_sequence.cwl | 20 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/check_sequence.py | 63 | ||||
-rw-r--r-- | workflows/yamlfa2ttl/yamlfa2ttl.cwl | 8 |
4 files changed, 93 insertions, 1 deletions
diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl index 24de620..9eee58a 100644 --- a/workflows/yamlfa2ttl/check_format.cwl +++ b/workflows/yamlfa2ttl/check_format.cwl @@ -3,6 +3,7 @@ cwlVersion: v1.1 class: CommandLineTool baseCommand: python3 + inputs: script: type: File @@ -18,5 +19,5 @@ inputs: type: File inputBinding: {position: 4} default: {class: File, location: ../../bh20sequploader/validation/formats} -outputs: [] +outputs: [] diff --git a/workflows/yamlfa2ttl/check_sequence.cwl b/workflows/yamlfa2ttl/check_sequence.cwl new file mode 100644 index 0000000..946948d --- /dev/null +++ b/workflows/yamlfa2ttl/check_sequence.cwl @@ -0,0 +1,20 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.1 +class: CommandLineTool +baseCommand: python3 + +inputs: + script: + type: File + inputBinding: {position: 1} + default: {class: File, location: check_sequence.py} + path_fasta: + type: string + inputBinding: {position: 2} + path_sars_cov_2_reference_fasta: + type: File + inputBinding: {position: 3} + default: {class: File, location: ../../bh20sequploader/SARS-CoV-2-reference.fasta} + +outputs: [] diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py new file mode 100644 index 0000000..f92bf6d --- /dev/null +++ b/workflows/yamlfa2ttl/check_sequence.py @@ -0,0 +1,63 @@ +import subprocess +import tempfile +import sys + +path_fasta = sys.argv[1] +path_sars_cov_2_reference_fasta = sys.argv[2] + + +def read_single_fasta(path_fasta): + with open(path_fasta) as f: + entries = 0 + header = None + sequence = [] + for line in f: + line = line.strip() + + if line.startswith(">"): + header = line + entries += 1 + else: + sequence.append(line) + + if entries > 1: + raise ValueError("Input FASTA contains multiple entries") + + return header, ''.join(sequence) + + +print("FASTA QC: checking similarity to the reference") + +header, sequence = read_single_fasta(path_fasta) + +similarity = 0 + +with tempfile.NamedTemporaryFile() as tmp_fasta: + with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta: + with open(path_sars_cov_2_reference_fasta, 'rb') as f: + tmp_sars_cov_2_reference_fasta.write(f.read()) + tmp_sars_cov_2_reference_fasta.flush() + + tmp_fasta.write(f'>{header}\n'.encode("utf8")) + tmp_fasta.write(sequence.encode("utf8")) + tmp_fasta.flush() + + cmd = [ + "minimap2", "-c", "-x", "asm20", + tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name + ] + print(" ".join(cmd)) + + result = subprocess.run(cmd, stdout=subprocess.PIPE) + result.check_returncode() + + paf_output = result.stdout.decode("utf-8") + paf_output = paf_output.split("\t") + if len(paf_output) >= 10: + # Number of matching bases in the mapping / Target sequence length + similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0 + +if similarity < 70.0: + raise ValueError( + f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')" + ) diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl index 1dce9ca..143fc9d 100644 --- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl +++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl @@ -18,7 +18,15 @@ steps: in: path_fasta: path_fasta format_to_check: format_to_check + doc: the input has to be a valid FASTA format file out: [] run: check_format.cwl + check_sequence: + in: + path_fasta: path_fasta + doc: the input sequence has to be enough similar to the reference + out: [] + run: check_sequence.cwl + outputs: [] |