diff options
author | AndreaGuarracino | 2020-07-27 17:27:07 +0200 |
---|---|---|
committer | AndreaGuarracino | 2020-07-27 17:27:07 +0200 |
commit | 618f956eb03c6a6ad1cc16efc931f55b0dce83e1 (patch) | |
tree | 6ae7daaf0978347c999dd1fc7926896baa777e8a /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py | |
parent | e31d89f6b4c0d2a99eb6df90b85b4e51cb584817 (diff) | |
download | bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.gz bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.lz bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.zip |
added workflow to sort a multifasta by quality and length, and added the overall new pangenome generation workflow with SPOA
Diffstat (limited to 'workflows/pangenome-generate/sort_fasta_by_quality_and_len.py')
-rw-r--r-- | workflows/pangenome-generate/sort_fasta_by_quality_and_len.py | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py new file mode 100644 index 0000000..e48fd68 --- /dev/null +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending). +# The best sequence is the longest one, with no uncalled bases. + +import os +import sys +import gzip + +def open_gzipsafe(path_file): + if path_file.endswith('.gz'): + return gzip.open(path_file, 'rt') + else: + return open(path_file) + +path_fasta = sys.argv[1] + +header_to_seq_dict = {} +header_percCalledBases_seqLength_list = [] + +with open_gzipsafe(path_fasta) as f: + for fasta in f.read().strip('\n>').split('>'): + header = fasta.strip('\n').split('\n')[0] + + header_to_seq_dict[ + header + ] = ''.join(fasta.strip('\n').split('\n')[1:]) + + seq_len = len(header_to_seq_dict[header]) + header_percCalledBases_seqLength_list.append([ + header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len + ]) + +for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): + sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) |