aboutsummaryrefslogtreecommitdiff
path: root/workflows/yamlfa2ttl/check_sequence.py
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/yamlfa2ttl/check_sequence.py')
-rw-r--r--workflows/yamlfa2ttl/check_sequence.py63
1 files changed, 63 insertions, 0 deletions
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py
new file mode 100644
index 0000000..f92bf6d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.py
@@ -0,0 +1,63 @@
+import subprocess
+import tempfile
+import sys
+
+path_fasta = sys.argv[1]
+path_sars_cov_2_reference_fasta = sys.argv[2]
+
+
+def read_single_fasta(path_fasta):
+ with open(path_fasta) as f:
+ entries = 0
+ header = None
+ sequence = []
+ for line in f:
+ line = line.strip()
+
+ if line.startswith(">"):
+ header = line
+ entries += 1
+ else:
+ sequence.append(line)
+
+ if entries > 1:
+ raise ValueError("Input FASTA contains multiple entries")
+
+ return header, ''.join(sequence)
+
+
+print("FASTA QC: checking similarity to the reference")
+
+header, sequence = read_single_fasta(path_fasta)
+
+similarity = 0
+
+with tempfile.NamedTemporaryFile() as tmp_fasta:
+ with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta:
+ with open(path_sars_cov_2_reference_fasta, 'rb') as f:
+ tmp_sars_cov_2_reference_fasta.write(f.read())
+ tmp_sars_cov_2_reference_fasta.flush()
+
+ tmp_fasta.write(f'>{header}\n'.encode("utf8"))
+ tmp_fasta.write(sequence.encode("utf8"))
+ tmp_fasta.flush()
+
+ cmd = [
+ "minimap2", "-c", "-x", "asm20",
+ tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
+ ]
+ print(" ".join(cmd))
+
+ result = subprocess.run(cmd, stdout=subprocess.PIPE)
+ result.check_returncode()
+
+ paf_output = result.stdout.decode("utf-8")
+ paf_output = paf_output.split("\t")
+ if len(paf_output) >= 10:
+ # Number of matching bases in the mapping / Target sequence length
+ similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0
+
+if similarity < 70.0:
+ raise ValueError(
+ f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')"
+ )