From 3c35250eb6394ec8d0e28e4391ec23f2223345c8 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 21 Nov 2020 21:21:36 +0100 Subject: generalized spoa workflow --- .../pangenome-generate/pangenome-generate_spoa.cwl | 2 +- .../sort_fasta_by_quality_and_len.cwl | 2 ++ .../pangenome-generate/sort_fasta_by_quality_and_len.py | 17 +++++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index b623054..a46524a 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -42,7 +42,7 @@ outputs: # outputSource: segment_components/colinear_components steps: dedup_and_sort_by_quality_and_len: - in: {readsFA: seqs} + in: {readsFA: seqs, reversed_sorting: 'true'} out: [sortedReadsFA, dups] run: sort_fasta_by_quality_and_len.cwl induceGraph: diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl index 26c20e8..f8df786 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl @@ -5,6 +5,8 @@ hints: coresMin: 1 ramMin: 3000 inputs: + reversed_sorting: + inputBinding: {position: 3} readsFA: type: File inputBinding: {position: 2} diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py index e836654..02ebf60 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -7,15 +7,17 @@ import os import sys import gzip -#import xxhash # Faster library +# import xxhash # Faster library import hashlib + def open_gzipsafe(path_file): if path_file.endswith('.gz'): - return gzip.open(path_file, 'rt') + return gzip.open(path_file, 'rt') else: return open(path_file) + path_fasta = sys.argv[1] hash_to_count_and_headers_dict = {} @@ -28,7 +30,7 @@ with open_gzipsafe(path_fasta) as f: header = fasta.strip('\n').split('\n')[0] sequence = ''.join(fasta.strip('\n').split('\n')[1:]) - ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library + # hash = xxhash.xxh64(sequence).hexdigest() # Faster library hash = hashlib.md5(sequence.encode('utf-8')).hexdigest() if hash not in hash_to_count_and_headers_dict: @@ -38,15 +40,18 @@ with open_gzipsafe(path_fasta) as f: header_to_seq_dict[header] = sequence seq_len = len(sequence) - header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len]) + header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len]) hash_to_count_and_headers_dict[hash][0] += 1 hash_to_count_and_headers_dict[hash][1].append(header) - with open('dups.txt', 'w') as fw: for count, header_list in hash_to_count_and_headers_dict.values(): fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n') -for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): +reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False + +for header, percCalledBases, seqLength_list in sorted( + header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting +): sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) -- cgit v1.2.3