workflows/pangenome-generate/sort_fasta_by_quality_and_len.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

#!/usr/bin/env python3

# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending).
# The best sequence is the longest one, with no uncalled bases.

import os
import sys
import gzip

# import xxhash # Faster library
import hashlib


def open_gzipsafe(path_file):
    if path_file.endswith('.gz'):
        return gzip.open(path_file, 'rt')
    else:
        return open(path_file)


path_fasta = sys.argv[1]

hash_to_count_and_headers_dict = {}

header_to_seq_dict = {}
header_percCalledBases_seqLength_list = []

with open_gzipsafe(path_fasta) as f:
    for fasta in f.read().strip('\n>').split('>'):
        header = fasta.strip('\n').split('\n')[0]
        sequence = ''.join(fasta.strip('\n').split('\n')[1:])

        # hash = xxhash.xxh64(sequence).hexdigest() # Faster library
        hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()

        if hash not in hash_to_count_and_headers_dict:
            # New sequence
            hash_to_count_and_headers_dict[hash] = [0, []]

            header_to_seq_dict[header] = sequence

            seq_len = len(sequence)
            header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len])

        hash_to_count_and_headers_dict[hash][0] += 1
        hash_to_count_and_headers_dict[hash][1].append(header)

with open('dups.txt', 'w') as fw:
    for count, header_list in hash_to_count_and_headers_dict.values():
        fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')

reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False

for header, percCalledBases, seqLength_list in sorted(
        header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting
):
    sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))