aboutsummaryrefslogtreecommitdiff
path: root/workflows
diff options
context:
space:
mode:
author: Peter Amstutz, 2020-08-19 16:30:57 -0400
committer: GitHub, 2020-08-19 16:30:57 -0400
commit: 592c921a3223c03d8a22f7a852641ac5d753fb31 (patch)
tree: 6529387e7c09e737a38e051173772a8700ed55e6 /workflows
parent: 795d022f4a876ae1cd7df54173fa3927969afe8d (diff)
parent: 4fe8876d6ff17479a36ebcce564aa6983e15c490 (diff)
download: bh20-seq-resource-592c921a3223c03d8a22f7a852641ac5d753fb31.tar.gz
bh20-seq-resource-592c921a3223c03d8a22f7a852641ac5d753fb31.tar.lz
bh20-seq-resource-592c921a3223c03d8a22f7a852641ac5d753fb31.zip
Merge pull request #101 from AndreaGuarracino/patch-3
used builtin hashlib md5 for the deduplication step
Diffstat (limited to 'workflows')
-rw-r--r-- workflows/pangenome-generate/sort_fasta_by_quality_and_len.py 8
1 file changed, 6 insertions, 2 deletions
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index 5f75021..e836654 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -6,7 +6,9 @@
import os
import sys
import gzip
-import xxhash
+
+#import xxhash # Faster library
+import hashlib
def open_gzipsafe(path_file):
if path_file.endswith('.gz'):
@@ -26,7 +28,9 @@ with open_gzipsafe(path_fasta) as f:
header = fasta.strip('\n').split('\n')[0]
sequence = ''.join(fasta.strip('\n').split('\n')[1:])
- hash = xxhash.xxh64(sequence).hexdigest()
+ ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library
+ hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
+
if hash not in hash_to_count_and_headers_dict:
# New sequence
hash_to_count_and_headers_dict[hash] = [0, []]