aboutsummaryrefslogtreecommitdiff
path: root/workflows/yamlfa2ttl/check_sequence.py
blob: f92bf6d5f86ed9fdf0e4bf9a9206f2dcd0e199ff (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import subprocess
import tempfile
import sys

path_fasta = sys.argv[1]
path_sars_cov_2_reference_fasta = sys.argv[2]


def read_single_fasta(path_fasta):
    with open(path_fasta) as f:
        entries = 0
        header = None
        sequence = []
        for line in f:
            line = line.strip()

            if line.startswith(">"):
                header = line
                entries += 1
            else:
                sequence.append(line)

            if entries > 1:
                raise ValueError("Input FASTA contains multiple entries")

    return header, ''.join(sequence)


print("FASTA QC: checking similarity to the reference")

header, sequence = read_single_fasta(path_fasta)

similarity = 0

with tempfile.NamedTemporaryFile() as tmp_fasta:
    with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta:
        with open(path_sars_cov_2_reference_fasta, 'rb') as f:
            tmp_sars_cov_2_reference_fasta.write(f.read())
        tmp_sars_cov_2_reference_fasta.flush()

        tmp_fasta.write(f'>{header}\n'.encode("utf8"))
        tmp_fasta.write(sequence.encode("utf8"))
        tmp_fasta.flush()

        cmd = [
            "minimap2", "-c", "-x", "asm20",
            tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
        ]
        print(" ".join(cmd))

        result = subprocess.run(cmd, stdout=subprocess.PIPE)
        result.check_returncode()

        paf_output = result.stdout.decode("utf-8")
        paf_output = paf_output.split("\t")
        if len(paf_output) >= 10:
            # Number of matching bases in the mapping / Target sequence length
            similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0

if similarity < 70.0:
    raise ValueError(
        f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')"
    )