Used the built-in hashlib MD5 for the deduplication step

author: AndreaGuarracino 2020-08-19 22:29:33 +0200
committer: AndreaGuarracino 2020-08-19 22:29:33 +0200
commit: 4fe8876d6ff17479a36ebcce564aa6983e15c490 (patch)
tree: 6529387e7c09e737a38e051173772a8700ed55e6
parent: 0cee8cc13b869ef389c941f662c3cef2409e1e61 (diff)
download: bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.tar.gz
bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.tar.lz
bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.zip
1 file changed, 6 insertions, 2 deletions
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index 5f75021..e836654 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -6,7 +6,9 @@
 import os
 import sys
 import gzip
-import xxhash
+
+#import xxhash # Faster library
+import hashlib
 
 def open_gzipsafe(path_file):
     if path_file.endswith('.gz'):
@@ -26,7 +28,9 @@ with open_gzipsafe(path_fasta) as f:
         header = fasta.strip('\n').split('\n')[0]
         sequence = ''.join(fasta.strip('\n').split('\n')[1:])
 
-        hash = xxhash.xxh64(sequence).hexdigest()
+        ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library
+        hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
+
         if hash not in hash_to_count_and_headers_dict:
             # New sequence
             hash_to_count_and_headers_dict[hash] = [0, []]
author	AndreaGuarracino	2020-08-19 22:29:33 +0200
committer	AndreaGuarracino	2020-08-19 22:29:33 +0200
commit	4fe8876d6ff17479a36ebcce564aa6983e15c490 (patch)
tree	6529387e7c09e737a38e051173772a8700ed55e6
parent	0cee8cc13b869ef389c941f662c3cef2409e1e61 (diff)
download	bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.tar.gz bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.tar.lz bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.zip