added workflow to sort a multifasta by quality and length, and added the overall new pangenome generation workflow with SPOA

author: AndreaGuarracino 2020-07-27 17:27:07 +0200
committer: AndreaGuarracino 2020-07-27 17:27:07 +0200
commit: 618f956eb03c6a6ad1cc16efc931f55b0dce83e1 (patch)
tree: 6ae7daaf0978347c999dd1fc7926896baa777e8a /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
parent: e31d89f6b4c0d2a99eb6df90b85b4e51cb584817 (diff)
download: bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.gz
bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.lz
bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.zip
1 files changed, 35 insertions, 0 deletions
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
new file mode 100644
index 0000000..e48fd68
--- /dev/null
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending).
+# The best sequence is the longest one, with no uncalled bases.
+
+import os
+import sys
+import gzip
+
+def open_gzipsafe(path_file):
+    if path_file.endswith('.gz'):
+    	return gzip.open(path_file, 'rt')
+    else:
+        return open(path_file)
+
+path_fasta = sys.argv[1]
+
+header_to_seq_dict = {}
+header_percCalledBases_seqLength_list = []
+
+with open_gzipsafe(path_fasta) as f:
+    for fasta in f.read().strip('\n>').split('>'):
+        header = fasta.strip('\n').split('\n')[0]
+
+        header_to_seq_dict[
+            header
+        ] = ''.join(fasta.strip('\n').split('\n')[1:])
+
+        seq_len = len(header_to_seq_dict[header])
+        header_percCalledBases_seqLength_list.append([
+            header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
+        ])
+
+for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+    sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))
author	AndreaGuarracino	2020-07-27 17:27:07 +0200
committer	AndreaGuarracino	2020-07-27 17:27:07 +0200
commit	618f956eb03c6a6ad1cc16efc931f55b0dce83e1 (patch)
tree	6ae7daaf0978347c999dd1fc7926896baa777e8a /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
parent	e31d89f6b4c0d2a99eb6df90b85b4e51cb584817 (diff)
download	bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.gz bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.lz bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.zip