From 618f956eb03c6a6ad1cc16efc931f55b0dce83e1 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 27 Jul 2020 17:27:07 +0200 Subject: added workflow to sort a multifasta by quality and length, and added the overall new pangenome generation workflow with SPOA --- .../sort_fasta_by_quality_and_len.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 workflows/pangenome-generate/sort_fasta_by_quality_and_len.py (limited to 'workflows/pangenome-generate/sort_fasta_by_quality_and_len.py') diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py new file mode 100644 index 0000000..e48fd68 --- /dev/null +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending). +# The best sequence is the longest one, with no uncalled bases. + +import os +import sys +import gzip + +def open_gzipsafe(path_file): + if path_file.endswith('.gz'): + return gzip.open(path_file, 'rt') + else: + return open(path_file) + +path_fasta = sys.argv[1] + +header_to_seq_dict = {} +header_percCalledBases_seqLength_list = [] + +with open_gzipsafe(path_fasta) as f: + for fasta in f.read().strip('\n>').split('>'): + header = fasta.strip('\n').split('\n')[0] + + header_to_seq_dict[ + header + ] = ''.join(fasta.strip('\n').split('\n')[1:]) + + seq_len = len(header_to_seq_dict[header]) + header_percCalledBases_seqLength_list.append([ + header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len + ]) + +for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): + sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) -- cgit v1.2.3