From 0cee8cc13b869ef389c941f662c3cef2409e1e61 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 19 Aug 2020 17:59:39 +0200 Subject: integrated the deduplication step in the sorting by quality and length script --- .../pangenome-generate/pangenome-generate_spoa.cwl | 16 +++++------- .../sort_fasta_by_quality_and_len.py | 29 ++++++++++++++++------ 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index 958ffb6..8b34ff8 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -31,9 +31,9 @@ outputs: odgiRDF: type: File outputSource: odgi2rdf/rdf - readsMergeDedup: + readsMergeDedupSortedByQualAndLen: type: File - outputSource: dedup/reads_dedup + outputSource: dedup_and_sort_by_quality_and_len/reads_dedupped_sorted_by_quality_and_len mergedMetadata: type: File outputSource: mergeMetadata/merged @@ -51,17 +51,13 @@ steps: exclude: exclude out: [relabeledSeqs, originalLabels] run: relabel-seqs.cwl - dedup: + dedup_and_sort_by_quality_and_len: in: {reads: relabel/relabeledSeqs} - out: [reads_dedup, dups] - run: ../tools/seqkit/seqkit_rmdup.cwl - sort_by_quality_and_len: - in: {reads: dedup/reads_dedup} - out: [reads_sorted_by_quality_and_len] + out: [reads_dedupped_sorted_by_quality_and_len, dups] run: sort_fasta_by_quality_and_len.cwl induceGraph: in: - readsFA: sort_by_quality_and_len/reads_sorted_by_quality_and_len + readsFA: dedup_and_sort_by_quality_and_len/reads_dedupped_sorted_by_quality_and_len out: [spoaGFA] run: spoa.cwl buildGraph: @@ -90,7 +86,7 @@ steps: metadata: metadata metadataSchema: metadataSchema subjects: subjects - dups: dedup/dups + dups: dedup_and_sort_by_quality_and_len/dups originalLabels: relabel/originalLabels out: [merged] run: merge-metadata.cwl diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py index e48fd68..5f75021 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -6,6 +6,7 @@ import os import sys import gzip +import xxhash def open_gzipsafe(path_file): if path_file.endswith('.gz'): @@ -15,21 +16,33 @@ def open_gzipsafe(path_file): path_fasta = sys.argv[1] +hash_to_count_and_headers_dict = {} + header_to_seq_dict = {} header_percCalledBases_seqLength_list = [] with open_gzipsafe(path_fasta) as f: for fasta in f.read().strip('\n>').split('>'): header = fasta.strip('\n').split('\n')[0] + sequence = ''.join(fasta.strip('\n').split('\n')[1:]) + + hash = xxhash.xxh64(sequence).hexdigest() + if hash not in hash_to_count_and_headers_dict: + # New sequence + hash_to_count_and_headers_dict[hash] = [0, []] + + header_to_seq_dict[header] = sequence + + seq_len = len(sequence) + header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len]) + + hash_to_count_and_headers_dict[hash][0] += 1 + hash_to_count_and_headers_dict[hash][1].append(header) - header_to_seq_dict[ - header - ] = ''.join(fasta.strip('\n').split('\n')[1:]) - seq_len = len(header_to_seq_dict[header]) - header_percCalledBases_seqLength_list.append([ - header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len - ]) +with open('dups.txt', 'w') as fw: + for count, header_list in hash_to_count_and_headers_dict.values(): + fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n') -for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): +for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) -- cgit v1.2.3