aboutsummaryrefslogtreecommitdiff
path: root/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
diff options
context:
space:
mode:
authorPjotr Prins2020-12-24 09:15:39 +0000
committerGitHub2020-12-24 09:15:39 +0000
commitdf0421757e464f07b5e96b5444f1926784e7400f (patch)
treef9a9a262bb2d95a89a6ec1c96b98a2b166fb92b5 /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
parent00ba74b163f723bb7283624171f0c7c203dc99e5 (diff)
parentbfd830f9d777c456409958030142155043ec1c68 (diff)
downloadbh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.tar.gz
bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.tar.lz
bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.zip
Merge pull request #117 from arvados/pangenome_workflow_abpoa
Pangenome workflow with abPOA
Diffstat (limited to 'workflows/pangenome-generate/sort_fasta_by_quality_and_len.py')
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.py17
1 file changed, 11 insertions, 6 deletions
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index e836654..02ebf60 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -7,15 +7,17 @@ import os
import sys
import gzip
-#import xxhash # Faster library
+# import xxhash # Faster library
import hashlib
+
def open_gzipsafe(path_file):
if path_file.endswith('.gz'):
- return gzip.open(path_file, 'rt')
+ return gzip.open(path_file, 'rt')
else:
return open(path_file)
+
path_fasta = sys.argv[1]
hash_to_count_and_headers_dict = {}
@@ -28,7 +30,7 @@ with open_gzipsafe(path_fasta) as f:
header = fasta.strip('\n').split('\n')[0]
sequence = ''.join(fasta.strip('\n').split('\n')[1:])
- ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library
+ # hash = xxhash.xxh64(sequence).hexdigest() # Faster library
hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
if hash not in hash_to_count_and_headers_dict:
@@ -38,15 +40,18 @@ with open_gzipsafe(path_fasta) as f:
header_to_seq_dict[header] = sequence
seq_len = len(sequence)
- header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len])
+ header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len])
hash_to_count_and_headers_dict[hash][0] += 1
hash_to_count_and_headers_dict[hash][1].append(header)
-
with open('dups.txt', 'w') as fw:
for count, header_list in hash_to_count_and_headers_dict.values():
fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
-for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False
+
+for header, percCalledBases, seqLength_list in sorted(
+ header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting
+):
sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))