author    AndreaGuarracino    2020-08-19 17:59:39 +0200
committer AndreaGuarracino    2020-08-19 17:59:39 +0200
commit    0cee8cc13b869ef389c941f662c3cef2409e1e61 (patch)
tree      734d620bc7ad8803cc459e605fcdb19b52830227 /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
parent    fdb1b012fc04ee07f401541e181e28fe442c9454 (diff)
integrated the deduplication step into the sort-by-quality-and-length script
Diffstat (limited to 'workflows/pangenome-generate/sort_fasta_by_quality_and_len.py')
-rw-r--r--  workflows/pangenome-generate/sort_fasta_by_quality_and_len.py | 29
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index e48fd68..5f75021 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -6,6 +6,7 @@
 import os
 import sys
 import gzip
+import xxhash
 
 def open_gzipsafe(path_file):
     if path_file.endswith('.gz'):
@@ -15,21 +16,33 @@ def open_gzipsafe(path_file):
 
 path_fasta = sys.argv[1]
 
+hash_to_count_and_headers_dict = {}
+
 header_to_seq_dict = {}
 header_percCalledBases_seqLength_list = []
 
 with open_gzipsafe(path_fasta) as f:
     for fasta in f.read().strip('\n>').split('>'):
         header = fasta.strip('\n').split('\n')[0]
-        header_to_seq_dict[
-            header
-        ] = ''.join(fasta.strip('\n').split('\n')[1:])
-        seq_len = len(header_to_seq_dict[header])
-        header_percCalledBases_seqLength_list.append([
-            header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
-        ])
+        sequence = ''.join(fasta.strip('\n').split('\n')[1:])
+
+        hash = xxhash.xxh64(sequence).hexdigest()
+        if hash not in hash_to_count_and_headers_dict:
+            # New sequence
+            hash_to_count_and_headers_dict[hash] = [0, []]
+
+            header_to_seq_dict[header] = sequence
+
+            seq_len = len(sequence)
+            header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len])
+
+        hash_to_count_and_headers_dict[hash][0] += 1
+        hash_to_count_and_headers_dict[hash][1].append(header)
 
+with open('dups.txt', 'w') as fw:
+    for count, header_list in hash_to_count_and_headers_dict.values():
+        fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
+
-for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
     sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))
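
For context, below is a minimal self-contained sketch of the logic this patch introduces: hash every sequence with xxh64, keep only the first occurrence of each distinct sequence, tally every occurrence (duplicates included) for dups.txt, and emit the survivors sorted by fraction of called (non-N) bases and then by length, both descending. The function name and record format are illustrative, not part of the repository; it assumes the xxhash package is installed (pip install xxhash) and that the FASTA has already been parsed into (header, sequence) tuples.

    import xxhash

    def dedup_and_sort(records):
        """Sketch of the patched script's behaviour (illustrative, not repo code).

        records: iterable of (header, sequence) tuples, e.g. parsed from FASTA.
        Returns (fasta_text, digest_to_count_and_headers).
        """
        digest_to_count_and_headers = {}  # xxh64 hex digest -> [count, [headers]]
        header_to_seq = {}                # header of first occurrence -> sequence
        kept = []                         # [header, frac_called_bases, seq_len]

        for header, seq in records:
            digest = xxhash.xxh64(seq).hexdigest()
            if digest not in digest_to_count_and_headers:
                # First occurrence of this exact sequence: keep it.
                # (An empty sequence would raise ZeroDivisionError, as in the script.)
                digest_to_count_and_headers[digest] = [0, []]
                header_to_seq[header] = seq
                kept.append([header, (len(seq) - seq.count('N')) / len(seq), len(seq)])
            # Every occurrence, duplicates included, is tallied for dups.txt.
            digest_to_count_and_headers[digest][0] += 1
            digest_to_count_and_headers[digest][1].append(header)

        # Highest fraction of called bases first; ties broken by longer sequence,
        # mirroring key=lambda x: (x[-2], x[-1]), reverse=True in the script.
        kept.sort(key=lambda r: (r[1], r[2]), reverse=True)
        fasta_text = ''.join('>{}\n{}\n'.format(h, header_to_seq[h]) for h, _, _ in kept)
        return fasta_text, digest_to_count_and_headers

For example, with records = [('seqA', 'ACGTN'), ('seqB', 'ACGTN'), ('seqC', 'ACGTACGT')], seqC is printed first (no Ns and longest), seqA second, and seqB is suppressed as an exact duplicate of seqA while still appearing alongside seqA in the duplicate tally.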