blob: 58a65b9b4eafac2e77c8ffe9b6376e9b276a78ff (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
import subprocess
import tempfile
import sys
path_fasta = sys.argv[1]
path_sars_cov_2_reference_fasta = sys.argv[2]
def read_single_fasta(path_fasta):
with open(path_fasta) as f:
entries = 0
header = None
sequence = []
for line in f:
line = line.strip()
if line.startswith(">"):
header = line
entries += 1
else:
sequence.append(line)
if entries > 1:
raise ValueError("Input FASTA contains multiple entries")
return header, ''.join(sequence)
print("FASTA QC: checking similarity to the reference", file=sys.stderr)
header, sequence = read_single_fasta(path_fasta)
similarity = 0
with tempfile.NamedTemporaryFile() as tmp_fasta:
with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta:
with open(path_sars_cov_2_reference_fasta, 'rb') as f:
tmp_sars_cov_2_reference_fasta.write(f.read())
tmp_sars_cov_2_reference_fasta.flush()
tmp_fasta.write(f'>{header}\n'.encode("utf8"))
tmp_fasta.write(sequence.encode("utf8"))
tmp_fasta.flush()
cmd = [
"minimap2", "-c", "-x", "asm20",
tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
]
print(" ".join(cmd), file=sys.stderr)
result = subprocess.run(cmd, stdout=subprocess.PIPE)
result.check_returncode()
paf_output = result.stdout.decode("utf-8")
paf_output = paf_output.split("\t")
if len(paf_output) >= 10:
# Number of matching bases in the mapping / Target sequence length
similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0
if similarity < 70.0:
raise ValueError(
f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')"
)
|