aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Amstutz2020-08-19 15:14:45 -0400
committerGitHub2020-08-19 15:14:45 -0400
commit795d022f4a876ae1cd7df54173fa3927969afe8d (patch)
tree734d620bc7ad8803cc459e605fcdb19b52830227
parentfdb1b012fc04ee07f401541e181e28fe442c9454 (diff)
parent0cee8cc13b869ef389c941f662c3cef2409e1e61 (diff)
downloadbh20-seq-resource-795d022f4a876ae1cd7df54173fa3927969afe8d.tar.gz
bh20-seq-resource-795d022f4a876ae1cd7df54173fa3927969afe8d.tar.lz
bh20-seq-resource-795d022f4a876ae1cd7df54173fa3927969afe8d.zip
Merge pull request #100 from AndreaGuarracino/patch-3
integrated the deduplication step in the sorting by quality and length script
-rw-r--r--workflows/pangenome-generate/pangenome-generate_spoa.cwl16
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.py29
2 files changed, 27 insertions, 18 deletions
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
index 958ffb6..8b34ff8 100644
--- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -31,9 +31,9 @@ outputs:
odgiRDF:
type: File
outputSource: odgi2rdf/rdf
- readsMergeDedup:
+ readsMergeDedupSortedByQualAndLen:
type: File
- outputSource: dedup/reads_dedup
+ outputSource: dedup_and_sort_by_quality_and_len/reads_dedupped_sorted_by_quality_and_len
mergedMetadata:
type: File
outputSource: mergeMetadata/merged
@@ -51,17 +51,13 @@ steps:
exclude: exclude
out: [relabeledSeqs, originalLabels]
run: relabel-seqs.cwl
- dedup:
+ dedup_and_sort_by_quality_and_len:
in: {reads: relabel/relabeledSeqs}
- out: [reads_dedup, dups]
- run: ../tools/seqkit/seqkit_rmdup.cwl
- sort_by_quality_and_len:
- in: {reads: dedup/reads_dedup}
- out: [reads_sorted_by_quality_and_len]
+ out: [reads_dedupped_sorted_by_quality_and_len, dups]
run: sort_fasta_by_quality_and_len.cwl
induceGraph:
in:
- readsFA: sort_by_quality_and_len/reads_sorted_by_quality_and_len
+ readsFA: dedup_and_sort_by_quality_and_len/reads_dedupped_sorted_by_quality_and_len
out: [spoaGFA]
run: spoa.cwl
buildGraph:
@@ -90,7 +86,7 @@ steps:
metadata: metadata
metadataSchema: metadataSchema
subjects: subjects
- dups: dedup/dups
+ dups: dedup_and_sort_by_quality_and_len/dups
originalLabels: relabel/originalLabels
out: [merged]
run: merge-metadata.cwl
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index e48fd68..5f75021 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -6,6 +6,7 @@
import os
import sys
import gzip
+import xxhash
def open_gzipsafe(path_file):
if path_file.endswith('.gz'):
@@ -15,21 +16,33 @@ def open_gzipsafe(path_file):
path_fasta = sys.argv[1]
+hash_to_count_and_headers_dict = {}
+
header_to_seq_dict = {}
header_percCalledBases_seqLength_list = []
with open_gzipsafe(path_fasta) as f:
for fasta in f.read().strip('\n>').split('>'):
header = fasta.strip('\n').split('\n')[0]
+ sequence = ''.join(fasta.strip('\n').split('\n')[1:])
+
+ hash = xxhash.xxh64(sequence).hexdigest()
+ if hash not in hash_to_count_and_headers_dict:
+ # New sequence
+ hash_to_count_and_headers_dict[hash] = [0, []]
+
+ header_to_seq_dict[header] = sequence
+
+ seq_len = len(sequence)
+ header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len])
+
+ hash_to_count_and_headers_dict[hash][0] += 1
+ hash_to_count_and_headers_dict[hash][1].append(header)
- header_to_seq_dict[
- header
- ] = ''.join(fasta.strip('\n').split('\n')[1:])
- seq_len = len(header_to_seq_dict[header])
- header_percCalledBases_seqLength_list.append([
- header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
- ])
+with open('dups.txt', 'w') as fw:
+ for count, header_list in hash_to_count_and_headers_dict.values():
+ fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
-for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))