aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreaGuarracino2020-07-27 17:27:07 +0200
committerAndreaGuarracino2020-07-27 17:27:07 +0200
commit618f956eb03c6a6ad1cc16efc931f55b0dce83e1 (patch)
tree6ae7daaf0978347c999dd1fc7926896baa777e8a
parente31d89f6b4c0d2a99eb6df90b85b4e51cb584817 (diff)
downloadbh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.gz
bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.tar.lz
bh20-seq-resource-618f956eb03c6a6ad1cc16efc931f55b0dce83e1.zip
added workflow to sort a multifasta by quality and length, and added the overall new pangenome generation workflow with SPOA
-rw-r--r--workflows/pangenome-generate/pangenome-generate_spoa.cwl122
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl18
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.py35
3 files changed, 175 insertions, 0 deletions
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
new file mode 100644
index 0000000..958ffb6
--- /dev/null
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -0,0 +1,122 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.1
+class: Workflow
+requirements:
+ ScatterFeatureRequirement: {}
+ StepInputExpressionRequirement: {}
+inputs:
+ inputReads: File[]
+ metadata: File[]
+ metadataSchema: File
+ subjects: string[]
+ exclude: File?
+ bin_widths:
+ type: int[]
+ default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
+ doc: width of each bin in basepairs along the graph vector
+ cells_per_file:
+ type: int
+ default: 100
+ doc: Cells per file on component_segmentation
+outputs:
+ odgiGraph:
+ type: File
+ outputSource: buildGraph/odgiGraph
+ odgiPNG:
+ type: File
+ outputSource: vizGraph/graph_image
+ spoaGFA:
+ type: File
+ outputSource: induceGraph/spoaGFA
+ odgiRDF:
+ type: File
+ outputSource: odgi2rdf/rdf
+ readsMergeDedup:
+ type: File
+ outputSource: dedup/reads_dedup
+ mergedMetadata:
+ type: File
+ outputSource: mergeMetadata/merged
+ indexed_paths:
+ type: File
+ outputSource: index_paths/indexed_paths
+ colinear_components:
+ type: Directory
+ outputSource: segment_components/colinear_components
+steps:
+ relabel:
+ in:
+ readsFA: inputReads
+ subjects: subjects
+ exclude: exclude
+ out: [relabeledSeqs, originalLabels]
+ run: relabel-seqs.cwl
+ dedup:
+ in: {reads: relabel/relabeledSeqs}
+ out: [reads_dedup, dups]
+ run: ../tools/seqkit/seqkit_rmdup.cwl
+ sort_by_quality_and_len:
+ in: {reads: dedup/reads_dedup}
+ out: [reads_sorted_by_quality_and_len]
+ run: sort_fasta_by_quality_and_len.cwl
+ induceGraph:
+ in:
+ readsFA: sort_by_quality_and_len/reads_sorted_by_quality_and_len
+ out: [spoaGFA]
+ run: spoa.cwl
+ buildGraph:
+ in: {inputGFA: induceGraph/spoaGFA}
+ out: [odgiGraph]
+ run: odgi-build-from-spoa-gfa.cwl
+ vizGraph:
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ width:
+ default: 50000
+ height:
+ default: 500
+ path_per_row:
+ default: true
+ path_height:
+ default: 4
+ out: [graph_image]
+ run: ../tools/odgi/odgi_viz.cwl
+ odgi2rdf:
+ in: {odgi: buildGraph/odgiGraph}
+ out: [rdf]
+ run: odgi_to_rdf.cwl
+ mergeMetadata:
+ in:
+ metadata: metadata
+ metadataSchema: metadataSchema
+ subjects: subjects
+ dups: dedup/dups
+ originalLabels: relabel/originalLabels
+ out: [merged]
+ run: merge-metadata.cwl
+ bin_paths:
+ run: ../tools/odgi/odgi_bin.cwl
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ bin_width: bin_widths
+ scatter: bin_width
+ out: [ bins, pangenome_sequence ]
+ index_paths:
+ label: Create path index
+ run: ../tools/odgi/odgi_pathindex.cwl
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ out: [ indexed_paths ]
+ segment_components:
+ label: Run component segmentation
+ run: ../tools/graph-genome-segmentation/component_segmentation.cwl
+ in:
+ bins: bin_paths/bins
+ cells_per_file: cells_per_file
+ pangenome_sequence:
+ source: bin_paths/pangenome_sequence
+ valueFrom: $(self[0])
+ # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
+ # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
+ # regardless of bin_width, so we take the first pangenome_sequence as input for this step
+ out: [ colinear_components ]
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
new file mode 100644
index 0000000..59f027e
--- /dev/null
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
@@ -0,0 +1,18 @@
+cwlVersion: v1.1
+class: CommandLineTool
+inputs:
+ readsFA:
+ type: File
+ inputBinding: {position: 2}
+ script:
+ type: File
+ inputBinding: {position: 1}
+ default: {class: File, location: sort_fasta_by_quality_and_len.py}
+stdout: $(inputs.readsFA.nameroot).sorted_by_quality_and_len.fasta
+outputs:
+ sortedReadsFA:
+ type: stdout
+requirements:
+ InlineJavascriptRequirement: {}
+ ShellCommandRequirement: {}
+baseCommand: [python]
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
new file mode 100644
index 0000000..e48fd68
--- /dev/null
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+# Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending).
+# The best sequence is the longest one, with no uncalled bases.
+
+import os
+import sys
+import gzip
+
+def open_gzipsafe(path_file):
+ if path_file.endswith('.gz'):
+ return gzip.open(path_file, 'rt')
+ else:
+ return open(path_file)
+
+path_fasta = sys.argv[1]
+
+header_to_seq_dict = {}
+header_percCalledBases_seqLength_list = []
+
+with open_gzipsafe(path_fasta) as f:
+ for fasta in f.read().strip('\n>').split('>'):
+ header = fasta.strip('\n').split('\n')[0]
+
+ header_to_seq_dict[
+ header
+ ] = ''.join(fasta.strip('\n').split('\n')[1:])
+
+ seq_len = len(header_to_seq_dict[header])
+ header_percCalledBases_seqLength_list.append([
+ header, header_to_seq_dict[header].count('N'), (seq_len - header_to_seq_dict[header].count('N'))/seq_len, seq_len
+ ])
+
+for header, x, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+ sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))