diff options
author | AndreaGuarracino | 2020-08-19 22:29:33 +0200 |
---|---|---|
committer | AndreaGuarracino | 2020-08-19 22:29:33 +0200 |
commit | 4fe8876d6ff17479a36ebcce564aa6983e15c490 (patch) | |
tree | 6529387e7c09e737a38e051173772a8700ed55e6 /workflows/pangenome-generate | |
parent | 0cee8cc13b869ef389c941f662c3cef2409e1e61 (diff) | |
download | bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.tar.gz bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.tar.lz bh20-seq-resource-4fe8876d6ff17479a36ebcce564aa6983e15c490.zip |
used builtin hashlib md5 for the deduplication step
Diffstat (limited to 'workflows/pangenome-generate')
-rw-r--r-- | workflows/pangenome-generate/sort_fasta_by_quality_and_len.py | 8 |
1 files changed, 6 insertions, 2 deletions
```diff
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index 5f75021..e836654 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -6,7 +6,9 @@
 import os
 import sys
 import gzip
-import xxhash
+
+#import xxhash # Faster library
+import hashlib
 
 def open_gzipsafe(path_file):
     if path_file.endswith('.gz'):
@@ -26,7 +28,9 @@ with open_gzipsafe(path_fasta) as f:
         header = fasta.strip('\n').split('\n')[0]
         sequence = ''.join(fasta.strip('\n').split('\n')[1:])
 
-        hash = xxhash.xxh64(sequence).hexdigest()
+        ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library
+        hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
+
         if hash not in hash_to_count_and_headers_dict:
             # New sequence
             hash_to_count_and_headers_dict[hash] = [0, []]
```