author    AndreaGuarracino    2020-08-19 17:59:39 +0200
committer AndreaGuarracino    2020-08-19 17:59:39 +0200
commit    0cee8cc13b869ef389c941f662c3cef2409e1e61 (patch)
tree      734d620bc7ad8803cc459e605fcdb19b52830227 /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
parent    fdb1b012fc04ee07f401541e181e28fe442c9454 (diff)
integrated the deduplication step into the sort-by-quality-and-length script
Diffstat (limited to 'workflows/pangenome-generate/sort_fasta_by_quality_and_len.py')
-rw-r--r--  workflows/pangenome-generate/sort_fasta_by_quality_and_len.py | 29
1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index e48fd68..5f75021 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -6,6 +6,7 @@
 import os
 import sys
 import gzip
+import xxhash
 
 def open_gzipsafe(path_file):
     if path_file.endswith('.gz'):
@@ -15,21 +16,33 @@ def open_gzipsafe(path_file):
 
 path_fasta = sys.argv[1]
 
+hash_to_count_and_headers_dict = {}
+
 header_to_seq_dict = {}
 header_percCalledBases_seqLength_list = []
 
 with open_gzipsafe(path_fasta) as f:
     for fasta in f.read().strip('\n>').split('>'):
         header = fasta.strip('\n').split('\n')[0]
-        header_to_seq_dict[
-            header
-        ] = ''.join(fasta.strip('\n').split('\n')[1:])
-        seq_len = len(header_to_seq_dict[header])
-        header_percCalledBases_seqLength_list.append([
-            header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
-        ])
+        sequence = ''.join(fasta.strip('\n').split('\n')[1:])
+
+        hash = xxhash.xxh64(sequence).hexdigest()
+        if hash not in hash_to_count_and_headers_dict:
+            # New sequence
+            hash_to_count_and_headers_dict[hash] = [0, []]
+
+            header_to_seq_dict[header] = sequence
+
+            seq_len = len(sequence)
+            header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len])
+
+        hash_to_count_and_headers_dict[hash][0] += 1
+        hash_to_count_and_headers_dict[hash][1].append(header)
 
+with open('dups.txt', 'w') as fw:
+    for count, header_list in hash_to_count_and_headers_dict.values():
+        fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
+
-for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
     sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))
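
For context, below is a minimal self-contained sketch of the logic this patch introduces: hash every sequence with xxh64, keep only the first occurrence of each distinct sequence, tally every occurrence (duplicates included) for dups.txt, and emit the survivors sorted by fraction of called (non-N) bases and then by length, both descending. The function name and record format are illustrative, not part of the repository; it assumes the xxhash package is installed (pip install xxhash) and that the FASTA has already been parsed into (header, sequence) tuples.

    import xxhash

    def dedup_and_sort(records):
        """Sketch of the patched script's behaviour (illustrative, not repo code).

        records: iterable of (header, sequence) tuples, e.g. parsed from FASTA.
        Returns (fasta_text, digest_to_count_and_headers).
        """
        digest_to_count_and_headers = {}  # xxh64 hex digest -> [count, [headers]]
        header_to_seq = {}                # header of first occurrence -> sequence
        kept = []                         # [header, frac_called_bases, seq_len]

        for header, seq in records:
            digest = xxhash.xxh64(seq).hexdigest()
            if digest not in digest_to_count_and_headers:
                # First occurrence of this exact sequence: keep it.
                # (An empty sequence would raise ZeroDivisionError, as in the script.)
                digest_to_count_and_headers[digest] = [0, []]
                header_to_seq[header] = seq
                kept.append([header, (len(seq) - seq.count('N')) / len(seq), len(seq)])
            # Every occurrence, duplicates included, is tallied for dups.txt.
            digest_to_count_and_headers[digest][0] += 1
            digest_to_count_and_headers[digest][1].append(header)

        # Highest fraction of called bases first; ties broken by longer sequence,
        # mirroring key=lambda x: (x[-2], x[-1]), reverse=True in the script.
        kept.sort(key=lambda r: (r[1], r[2]), reverse=True)
        fasta_text = ''.join('>{}\n{}\n'.format(h, header_to_seq[h]) for h, _, _ in kept)
        return fasta_text, digest_to_count_and_headers

For example, with records = [('seqA', 'ACGTN'), ('seqB', 'ACGTN'), ('seqC', 'ACGTACGT')], seqC is printed first (no Ns and longest), seqA second, and seqB is suppressed as an exact duplicate of seqA while still appearing alongside seqA in the duplicate tally.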