aboutsummaryrefslogtreecommitdiff
path: root/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
blob: e48fd68160bf99e77d348faffc05abea88dd30ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python3

# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending).
# The best sequence is the longest one, with no uncalled bases.

import os
import sys
import gzip

def open_gzipsafe(path_file):
    if path_file.endswith('.gz'):
    	return gzip.open(path_file, 'rt')
    else:
        return open(path_file)

path_fasta = sys.argv[1]

header_to_seq_dict = {}
header_percCalledBases_seqLength_list = []

with open_gzipsafe(path_fasta) as f:
    for fasta in f.read().strip('\n>').split('>'):
        header = fasta.strip('\n').split('\n')[0]

        header_to_seq_dict[
            header
        ] = ''.join(fasta.strip('\n').split('\n')[1:])

        seq_len = len(header_to_seq_dict[header])
        header_percCalledBases_seqLength_list.append([
            header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
        ])

for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
    sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))