diff options
Diffstat (limited to 'workflows/yamlfa2ttl/check_sequence.py')
-rw-r--r-- | workflows/yamlfa2ttl/check_sequence.py | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py new file mode 100644 index 0000000..f92bf6d --- /dev/null +++ b/workflows/yamlfa2ttl/check_sequence.py @@ -0,0 +1,63 @@ +import subprocess +import tempfile +import sys + +path_fasta = sys.argv[1] +path_sars_cov_2_reference_fasta = sys.argv[2] + + +def read_single_fasta(path_fasta): + with open(path_fasta) as f: + entries = 0 + header = None + sequence = [] + for line in f: + line = line.strip() + + if line.startswith(">"): + header = line + entries += 1 + else: + sequence.append(line) + + if entries > 1: + raise ValueError("Input FASTA contains multiple entries") + + return header, ''.join(sequence) + + +print("FASTA QC: checking similarity to the reference") + +header, sequence = read_single_fasta(path_fasta) + +similarity = 0 + +with tempfile.NamedTemporaryFile() as tmp_fasta: + with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta: + with open(path_sars_cov_2_reference_fasta, 'rb') as f: + tmp_sars_cov_2_reference_fasta.write(f.read()) + tmp_sars_cov_2_reference_fasta.flush() + + tmp_fasta.write(f'>{header}\n'.encode("utf8")) + tmp_fasta.write(sequence.encode("utf8")) + tmp_fasta.flush() + + cmd = [ + "minimap2", "-c", "-x", "asm20", + tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name + ] + print(" ".join(cmd)) + + result = subprocess.run(cmd, stdout=subprocess.PIPE) + result.check_returncode() + + paf_output = result.stdout.decode("utf-8") + paf_output = paf_output.split("\t") + if len(paf_output) >= 10: + # Number of matching bases in the mapping / Target sequence length + similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0 + +if similarity < 70.0: + raise ValueError( + f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')" + ) |