From 618f956eb03c6a6ad1cc16efc931f55b0dce83e1 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Mon, 27 Jul 2020 17:27:07 +0200
Subject: added workflow to sort a multifasta by quality and length, and added
 the overall new pangenome generation workflow with SPOA

---
 .../sort_fasta_by_quality_and_len.py               | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 workflows/pangenome-generate/sort_fasta_by_quality_and_len.py

(limited to 'workflows/pangenome-generate/sort_fasta_by_quality_and_len.py')

diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
new file mode 100644
index 0000000..e48fd68
--- /dev/null
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending).
+# The best sequence is the longest one, with no uncalled bases.
+
+import os
+import sys
+import gzip
+
+def open_gzipsafe(path_file):
+    if path_file.endswith('.gz'):
+    	return gzip.open(path_file, 'rt')
+    else:
+        return open(path_file)
+
+path_fasta = sys.argv[1]
+
+header_to_seq_dict = {}
+header_percCalledBases_seqLength_list = []
+
+with open_gzipsafe(path_fasta) as f:
+    for fasta in f.read().strip('\n>').split('>'):
+        header = fasta.strip('\n').split('\n')[0]
+
+        header_to_seq_dict[
+            header
+        ] = ''.join(fasta.strip('\n').split('\n')[1:])
+
+        seq_len = len(header_to_seq_dict[header])
+        header_percCalledBases_seqLength_list.append([
+            header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
+        ])
+
+for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+    sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))
-- 
cgit v1.2.3