From 618f956eb03c6a6ad1cc16efc931f55b0dce83e1 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Mon, 27 Jul 2020 17:27:07 +0200 Subject: added workflow to sort a multifasta by quality and length, and added the overall new pangenome generation workflow with SPOA --- .../pangenome-generate/pangenome-generate_spoa.cwl | 122 +++++++++++++++++++++ .../sort_fasta_by_quality_and_len.cwl | 18 +++ .../sort_fasta_by_quality_and_len.py | 35 ++++++ 3 files changed, 175 insertions(+) create mode 100644 workflows/pangenome-generate/pangenome-generate_spoa.cwl create mode 100644 workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl create mode 100644 workflows/pangenome-generate/sort_fasta_by_quality_and_len.py (limited to 'workflows/pangenome-generate') diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl new file mode 100644 index 0000000..958ffb6 --- /dev/null +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -0,0 +1,122 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.1 +class: Workflow +requirements: + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} +inputs: + inputReads: File[] + metadata: File[] + metadataSchema: File + subjects: string[] + exclude: File? + bin_widths: + type: int[] + default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000] + doc: width of each bin in basepairs along the graph vector + cells_per_file: + type: int + default: 100 + doc: Cells per file on component_segmentation +outputs: + odgiGraph: + type: File + outputSource: buildGraph/odgiGraph + odgiPNG: + type: File + outputSource: vizGraph/graph_image + spoaGFA: + type: File + outputSource: induceGraph/spoaGFA + odgiRDF: + type: File + outputSource: odgi2rdf/rdf + readsMergeDedup: + type: File + outputSource: dedup/reads_dedup + mergedMetadata: + type: File + outputSource: mergeMetadata/merged + indexed_paths: + type: File + outputSource: index_paths/indexed_paths + colinear_components: + type: Directory + outputSource: segment_components/colinear_components +steps: + relabel: + in: + readsFA: inputReads + subjects: subjects + exclude: exclude + out: [relabeledSeqs, originalLabels] + run: relabel-seqs.cwl + dedup: + in: {reads: relabel/relabeledSeqs} + out: [reads_dedup, dups] + run: ../tools/seqkit/seqkit_rmdup.cwl + sort_by_quality_and_len: + in: {reads: dedup/reads_dedup} + out: [reads_sorted_by_quality_and_len] + run: sort_fasta_by_quality_and_len.cwl + induceGraph: + in: + readsFA: sort_by_quality_and_len/reads_sorted_by_quality_and_len + out: [spoaGFA] + run: spoa.cwl + buildGraph: + in: {inputGFA: induceGraph/spoaGFA} + out: [odgiGraph] + run: odgi-build-from-spoa-gfa.cwl + vizGraph: + in: + sparse_graph_index: buildGraph/odgiGraph + width: + default: 50000 + height: + default: 500 + path_per_row: + default: true + path_height: + default: 4 + out: [graph_image] + run: ../tools/odgi/odgi_viz.cwl + odgi2rdf: + in: {odgi: buildGraph/odgiGraph} + out: [rdf] + run: odgi_to_rdf.cwl + mergeMetadata: + in: + metadata: metadata + metadataSchema: metadataSchema + subjects: subjects + dups: dedup/dups + originalLabels: relabel/originalLabels + out: [merged] + run: merge-metadata.cwl + bin_paths: + run: ../tools/odgi/odgi_bin.cwl + in: + sparse_graph_index: buildGraph/odgiGraph + bin_width: bin_widths + scatter: bin_width + out: [ bins, pangenome_sequence ] + index_paths: + label: Create path index + run: ../tools/odgi/odgi_pathindex.cwl + in: + sparse_graph_index: buildGraph/odgiGraph + out: [ indexed_paths ] + segment_components: + label: Run component segmentation + run: ../tools/graph-genome-segmentation/component_segmentation.cwl + in: + bins: bin_paths/bins + cells_per_file: cells_per_file + pangenome_sequence: + source: bin_paths/pangenome_sequence + valueFrom: $(self[0]) + # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index + # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index + # regardless of bin_width, so we take the first pangenome_sequence as input for this step + out: [ colinear_components ] diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl new file mode 100644 index 0000000..59f027e --- /dev/null +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl @@ -0,0 +1,18 @@ +cwlVersion: v1.1 +class: CommandLineTool +inputs: + readsFA: + type: File + inputBinding: {position: 2} + script: + type: File + inputBinding: {position: 1} + default: {class: File, location: sort_fasta_by_quality_and_len.py} +stdout: $(inputs.readsFA.nameroot).sorted_by_quality_and_len.fasta +outputs: + sortedReadsFA: + type: stdout +requirements: + InlineJavascriptRequirement: {} + ShellCommandRequirement: {} +baseCommand: [python] diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py new file mode 100644 index 0000000..e48fd68 --- /dev/null +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending). +# The best sequence is the longest one, with no uncalled bases. + +import os +import sys +import gzip + +def open_gzipsafe(path_file): + if path_file.endswith('.gz'): + return gzip.open(path_file, 'rt') + else: + return open(path_file) + +path_fasta = sys.argv[1] + +header_to_seq_dict = {} +header_percCalledBases_seqLength_list = [] + +with open_gzipsafe(path_fasta) as f: + for fasta in f.read().strip('\n>').split('>'): + header = fasta.strip('\n').split('\n')[0] + + header_to_seq_dict[ + header + ] = ''.join(fasta.strip('\n').split('\n')[1:]) + + seq_len = len(header_to_seq_dict[header]) + header_percCalledBases_seqLength_list.append([ + header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len + ]) + +for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): + sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) -- cgit v1.2.3