author     AndreaGuarracino  2020-11-21 21:21:36 +0100
committer  AndreaGuarracino  2020-11-21 21:21:36 +0100
commit     3c35250eb6394ec8d0e28e4391ec23f2223345c8 (patch)
tree       017660f818af2e460c64259a5b82477beca1822f
parent     6814f9d74ca1d7fe4a7a08e5df0878557bd8e8f1 (diff)
generalized spoa workflow
-rw-r--r--  workflows/pangenome-generate/pangenome-generate_spoa.cwl        2
-rw-r--r--  workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl  2
-rw-r--r--  workflows/pangenome-generate/sort_fasta_by_quality_and_len.py   17
3 files changed, 14 insertions(+), 7 deletions(-)
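
The change threads a reversed_sorting flag from the dedup_and_sort_by_quality_and_len step, through the CWL tool's command line (position 3), down to sys.argv[2] in sort_fasta_by_quality_and_len.py. Below is a minimal sketch of the resulting invocation; it assumes the tool keeps running the script via python3, and the input file name is only illustrative, neither is shown in this diff.

    # Sketch only: approximate command the CWL tool builds after this change,
    # with the FASTA bound at position 2 and the reversed_sorting string at position 3.
    import subprocess

    subprocess.run(
        [
            "python3",
            "sort_fasta_by_quality_and_len.py",  # script from this repo; python3 runner assumed
            "relabeledSeqs.fasta",               # readsFA (inputBinding position 2); illustrative name
            "true",                              # reversed_sorting (inputBinding position 3)
        ],
        check=True,
    )
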
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
index b623054..a46524a 100644
--- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -42,7 +42,7 @@ outputs:
# outputSource: segment_components/colinear_components
steps:
dedup_and_sort_by_quality_and_len:
- in: {readsFA: seqs}
+ in: {readsFA: seqs, reversed_sorting: 'true'}
out: [sortedReadsFA, dups]
run: sort_fasta_by_quality_and_len.cwl
induceGraph:
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
index 26c20e8..f8df786 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
@@ -5,6 +5,8 @@ hints:
coresMin: 1
ramMin: 3000
inputs:
+ reversed_sorting:
+ inputBinding: {position: 3}
readsFA:
type: File
inputBinding: {position: 2}
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index e836654..02ebf60 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -7,15 +7,17 @@ import os
import sys
import gzip
-#import xxhash # Faster library
+# import xxhash # Faster library
import hashlib
+
def open_gzipsafe(path_file):
if path_file.endswith('.gz'):
- return gzip.open(path_file, 'rt')
+ return gzip.open(path_file, 'rt')
else:
return open(path_file)
+
path_fasta = sys.argv[1]
hash_to_count_and_headers_dict = {}
@@ -28,7 +30,7 @@ with open_gzipsafe(path_fasta) as f:
header = fasta.strip('\n').split('\n')[0]
sequence = ''.join(fasta.strip('\n').split('\n')[1:])
- ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library
+ # hash = xxhash.xxh64(sequence).hexdigest() # Faster library
hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
if hash not in hash_to_count_and_headers_dict:
@@ -38,15 +40,18 @@ with open_gzipsafe(path_fasta) as f:
header_to_seq_dict[header] = sequence
seq_len = len(sequence)
- header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len])
+ header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len])
hash_to_count_and_headers_dict[hash][0] += 1
hash_to_count_and_headers_dict[hash][1].append(header)
-
with open('dups.txt', 'w') as fw:
for count, header_list in hash_to_count_and_headers_dict.values():
fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
-for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False
+
+for header, percCalledBases, seqLength_list in sorted(
+ header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting
+):
sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))
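
For context, a self-contained sketch of what the new flag toggles (the records below are invented; only the sort key and reverse argument mirror the script): the key is still (fraction of called bases, sequence length), but reverse is now taken from the second command-line argument instead of being hard-coded to True. The SPOA workflow passes 'true', so its output order is unchanged.

    # Toy records: (header, fraction of called bases, sequence length); values invented.
    records = [
        ("seq_a", 0.99, 29900),
        ("seq_b", 0.95, 29903),
        ("seq_c", 0.99, 29800),
    ]

    for reversed_sorting in (True, False):
        ordered = sorted(records, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting)
        print(reversed_sorting, [header for header, _, _ in ordered])

    # True  -> ['seq_a', 'seq_c', 'seq_b']   best-quality, longest sequences first (previous behaviour)
    # False -> ['seq_b', 'seq_c', 'seq_a']   ascending order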