diff options
author | Pjotr Prins | 2020-12-24 09:15:39 +0000 |
---|---|---|
committer | GitHub | 2020-12-24 09:15:39 +0000 |
commit | df0421757e464f07b5e96b5444f1926784e7400f (patch) | |
tree | f9a9a262bb2d95a89a6ec1c96b98a2b166fb92b5 /workflows | |
parent | 00ba74b163f723bb7283624171f0c7c203dc99e5 (diff) | |
parent | bfd830f9d777c456409958030142155043ec1c68 (diff) | |
download | bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.tar.gz bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.tar.lz bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.zip |
Merge pull request #117 from arvados/pangenome_workflow_abpoa
Pangenome workflow with abPOA
Diffstat (limited to 'workflows')
-rw-r--r-- | workflows/pangenome-generate/abpoa.cwl | 26 | ||||
-rw-r--r-- | workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl (renamed from workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl) | 0 | ||||
-rw-r--r-- | workflows/pangenome-generate/pangenome-generate_abpoa.cwl | 120 | ||||
-rw-r--r-- | workflows/pangenome-generate/pangenome-generate_spoa.cwl | 7 | ||||
-rw-r--r-- | workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl | 5 | ||||
-rw-r--r-- | workflows/pangenome-generate/sort_fasta_by_quality_and_len.py | 17 |
6 files changed, 167 insertions, 8 deletions
diff --git a/workflows/pangenome-generate/abpoa.cwl b/workflows/pangenome-generate/abpoa.cwl new file mode 100644 index 0000000..fa9f157 --- /dev/null +++ b/workflows/pangenome-generate/abpoa.cwl @@ -0,0 +1,26 @@ +cwlVersion: v1.1 +class: CommandLineTool +inputs: + readsFA: File + script: + type: File + default: {class: File, location: relabel-seqs.py} +outputs: + abpoaGFA: + type: stdout +requirements: + InlineJavascriptRequirement: {} +hints: + DockerRequirement: + dockerPull: "quay.io/biocontainers/abpoa:1.0.5--hed695b0_0" + ResourceRequirement: + coresMin: 1 + ramMin: $(15 * 1024) + outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) +baseCommand: abpoa +stdout: $(inputs.readsFA.nameroot).O0.gfa +arguments: [ + $(inputs.readsFA), + -r 3, + -O, '0' +] diff --git a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl b/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl index eee4031..eee4031 100644 --- a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl +++ b/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl diff --git a/workflows/pangenome-generate/pangenome-generate_abpoa.cwl b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl new file mode 100644 index 0000000..c61e51a --- /dev/null +++ b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl @@ -0,0 +1,120 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.1 +class: Workflow +requirements: + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} +inputs: + seqs: File + metadata: File + bin_widths: + type: int[] + default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000] + doc: width of each bin in basepairs along the graph vector + cells_per_file: + type: int + default: 100 + doc: Cells per file on component_segmentation + reversed_sort: + type: string + default: "false" +outputs: + odgiGraph: + type: File + outputSource: buildGraph/odgiGraph + odgiPNG: + type: File + outputSource: vizGraph/graph_image + abpoaGFA: + type: File + outputSource: induceGraph/abpoaGFA +# odgiRDF: +# type: File +# outputSource: odgi2rdf/rdf + readsMergeDedupSortedByQualAndLen: + type: File + outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA + mergedMetadata: + type: File + outputSource: dups2metadata/merged +# indexed_paths: +# type: File +# outputSource: index_paths/indexed_paths +# colinear_components: +# type: Directory +# outputSource: segment_components/colinear_components +steps: + dedup_and_sort_by_quality_and_len: + in: {readsFA: seqs, reversed_sorting: reversed_sort} + out: [sortedReadsFA, dups] + run: sort_fasta_by_quality_and_len.cwl + induceGraph: + in: + readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA + out: [abpoaGFA] + run: abpoa.cwl + buildGraph: + in: {inputGFA: induceGraph/abpoaGFA} + out: [odgiGraph] + run: odgi-build-from-xpoa-gfa.cwl + vizGraph: + in: + sparse_graph_index: buildGraph/odgiGraph + width: + default: 50000 + height: + default: 500 + path_per_row: + default: true + path_height: + default: 4 + out: [graph_image] + requirements: + ResourceRequirement: + ramMin: $(15 * 1024) + outdirMin: 10 + run: ../tools/odgi/odgi_viz.cwl + # odgi2rdf: + # in: {odgi: buildGraph/odgiGraph} + # out: [rdf] + # run: odgi_to_rdf.cwl + dups2metadata: + in: + metadata: metadata + dups: dedup_and_sort_by_quality_and_len/dups + out: [merged] + run: dups2metadata.cwl + # bin_paths: + # requirements: + # ResourceRequirement: + # ramMin: 3000 + # outdirMin: 10 + # run: ../tools/odgi/odgi_bin.cwl + # in: + # sparse_graph_index: buildGraph/odgiGraph + # bin_width: bin_widths + # scatter: bin_width + # out: [ bins, pangenome_sequence ] + # index_paths: + # label: Create path index + # requirements: + # ResourceRequirement: + # ramMin: 3000 + # outdirMin: 10 + # run: ../tools/odgi/odgi_pathindex.cwl + # in: + # sparse_graph_index: buildGraph/odgiGraph + # out: [ indexed_paths ] + # segment_components: + # label: Run component segmentation + # run: ../tools/graph-genome-segmentation/component_segmentation.cwl + # in: + # bins: bin_paths/bins + # cells_per_file: cells_per_file + # pangenome_sequence: + # source: bin_paths/pangenome_sequence + # valueFrom: $(self[0]) + # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index + # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index + # # regardless of bin_width, so we take the first pangenome_sequence as input for this step + # out: [ colinear_components ] diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index b623054..fa16809 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -15,6 +15,9 @@ inputs: type: int default: 100 doc: Cells per file on component_segmentation + reversed_sort: + type: string + default: "true" outputs: odgiGraph: type: File @@ -42,7 +45,7 @@ outputs: # outputSource: segment_components/colinear_components steps: dedup_and_sort_by_quality_and_len: - in: {readsFA: seqs} + in: {readsFA: seqs, reversed_sorting: reversed_sort} out: [sortedReadsFA, dups] run: sort_fasta_by_quality_and_len.cwl induceGraph: @@ -53,7 +56,7 @@ steps: buildGraph: in: {inputGFA: induceGraph/spoaGFA} out: [odgiGraph] - run: odgi-build-from-spoa-gfa.cwl + run: odgi-build-from-xpoa-gfa.cwl vizGraph: in: sparse_graph_index: buildGraph/odgiGraph diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl index 26c20e8..6f71d92 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl @@ -1,3 +1,4 @@ + cwlVersion: v1.1 class: CommandLineTool hints: @@ -5,6 +6,9 @@ hints: coresMin: 1 ramMin: 3000 inputs: + reversed_sorting: + type: string + inputBinding: {position: 3} readsFA: type: File inputBinding: {position: 2} @@ -23,3 +27,4 @@ requirements: InlineJavascriptRequirement: {} ShellCommandRequirement: {} baseCommand: [python] + diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py index e836654..02ebf60 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -7,15 +7,17 @@ import os import sys import gzip -#import xxhash # Faster library +# import xxhash # Faster library import hashlib + def open_gzipsafe(path_file): if path_file.endswith('.gz'): - return gzip.open(path_file, 'rt') + return gzip.open(path_file, 'rt') else: return open(path_file) + path_fasta = sys.argv[1] hash_to_count_and_headers_dict = {} @@ -28,7 +30,7 @@ with open_gzipsafe(path_fasta) as f: header = fasta.strip('\n').split('\n')[0] sequence = ''.join(fasta.strip('\n').split('\n')[1:]) - ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library + # hash = xxhash.xxh64(sequence).hexdigest() # Faster library hash = hashlib.md5(sequence.encode('utf-8')).hexdigest() if hash not in hash_to_count_and_headers_dict: @@ -38,15 +40,18 @@ with open_gzipsafe(path_fasta) as f: header_to_seq_dict[header] = sequence seq_len = len(sequence) - header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len]) + header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len]) hash_to_count_and_headers_dict[hash][0] += 1 hash_to_count_and_headers_dict[hash][1].append(header) - with open('dups.txt', 'w') as fw: for count, header_list in hash_to_count_and_headers_dict.values(): fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n') -for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): +reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False + +for header, percCalledBases, seqLength_list in sorted( + header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting +): sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) |