From 3c35250eb6394ec8d0e28e4391ec23f2223345c8 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 21 Nov 2020 21:21:36 +0100 Subject: generalized spoa workflow --- .../pangenome-generate/pangenome-generate_spoa.cwl | 2 +- .../sort_fasta_by_quality_and_len.cwl | 2 ++ .../pangenome-generate/sort_fasta_by_quality_and_len.py | 17 +++++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index b623054..a46524a 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -42,7 +42,7 @@ outputs: # outputSource: segment_components/colinear_components steps: dedup_and_sort_by_quality_and_len: - in: {readsFA: seqs} + in: {readsFA: seqs, reversed_sorting: 'true'} out: [sortedReadsFA, dups] run: sort_fasta_by_quality_and_len.cwl induceGraph: diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl index 26c20e8..f8df786 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl @@ -5,6 +5,8 @@ hints: coresMin: 1 ramMin: 3000 inputs: + reversed_sorting: + inputBinding: {position: 3} readsFA: type: File inputBinding: {position: 2} diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py index e836654..02ebf60 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py @@ -7,15 +7,17 @@ import os import sys import gzip -#import xxhash # Faster library +# import xxhash # Faster library import hashlib + def open_gzipsafe(path_file): if path_file.endswith('.gz'): - return gzip.open(path_file, 'rt') + return gzip.open(path_file, 'rt') else: return open(path_file) + path_fasta = sys.argv[1] hash_to_count_and_headers_dict = {} @@ -28,7 +30,7 @@ with open_gzipsafe(path_fasta) as f: header = fasta.strip('\n').split('\n')[0] sequence = ''.join(fasta.strip('\n').split('\n')[1:]) - ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library + # hash = xxhash.xxh64(sequence).hexdigest() # Faster library hash = hashlib.md5(sequence.encode('utf-8')).hexdigest() if hash not in hash_to_count_and_headers_dict: @@ -38,15 +40,18 @@ with open_gzipsafe(path_fasta) as f: header_to_seq_dict[header] = sequence seq_len = len(sequence) - header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len]) + header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len]) hash_to_count_and_headers_dict[hash][0] += 1 hash_to_count_and_headers_dict[hash][1].append(header) - with open('dups.txt', 'w') as fw: for count, header_list in hash_to_count_and_headers_dict.values(): fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n') -for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True): +reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False + +for header, percCalledBases, seqLength_list in sorted( + header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting +): sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) -- cgit v1.2.3 From 862ae0235e3d1b105fa342cedf4aa59ae73d90c4 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 21 Nov 2020 22:00:34 +0100 Subject: added reversed_sorting parameter; typos --- workflows/pangenome-generate/pangenome-generate_spoa.cwl | 5 ++++- workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index a46524a..4e7302d 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -15,6 +15,9 @@ inputs: type: int default: 100 doc: Cells per file on component_segmentation + reversed_sort: + type: string + default: "true" outputs: odgiGraph: type: File @@ -42,7 +45,7 @@ outputs: # outputSource: segment_components/colinear_components steps: dedup_and_sort_by_quality_and_len: - in: {readsFA: seqs, reversed_sorting: 'true'} + in: {readsFA: seqs, reversed_sorting: reversed_sort} out: [sortedReadsFA, dups] run: sort_fasta_by_quality_and_len.cwl induceGraph: diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl index f8df786..6f71d92 100644 --- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl +++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl @@ -1,3 +1,4 @@ + cwlVersion: v1.1 class: CommandLineTool hints: @@ -5,7 +6,8 @@ hints: coresMin: 1 ramMin: 3000 inputs: - reversed_sorting: + reversed_sorting: + type: string inputBinding: {position: 3} readsFA: type: File @@ -25,3 +27,4 @@ requirements: InlineJavascriptRequirement: {} ShellCommandRequirement: {} baseCommand: [python] + -- cgit v1.2.3 From 1e9a735e84a8e89099b1de2ec66c56669caf6cc8 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 21 Nov 2020 23:06:05 +0100 Subject: added abPOA workflow; typos --- scripts/create_sra_metadata/create_sra_metadata.py | 2 +- workflows/pangenome-generate/abpoa.cwl | 26 +++++ .../odgi-build-from-spoa-gfa.cwl | 29 ----- .../odgi-build-from-xpoa-gfa.cwl | 29 +++++ .../pangenome-generate_abpoa.cwl | 120 +++++++++++++++++++++ .../pangenome-generate/pangenome-generate_spoa.cwl | 2 +- 6 files changed, 177 insertions(+), 31 deletions(-) create mode 100644 workflows/pangenome-generate/abpoa.cwl delete mode 100644 workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl create mode 100644 workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl create mode 100644 workflows/pangenome-generate/pangenome-generate_abpoa.cwl diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index eebd9aa..35f61a5 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -232,7 +232,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_" + taxon_id - # This script download and prepare data and metadata for samples that will be mapepd againg a referenceT + # This script download and prepare data and metadata for samples that will be mapped against a referenceT info_for_yaml_dict['technology']['assembly_method'] = 'http://purl.obolibrary.org/obo/GENEPIO_0002028' EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT') diff --git a/workflows/pangenome-generate/abpoa.cwl b/workflows/pangenome-generate/abpoa.cwl new file mode 100644 index 0000000..fa9f157 --- /dev/null +++ b/workflows/pangenome-generate/abpoa.cwl @@ -0,0 +1,26 @@ +cwlVersion: v1.1 +class: CommandLineTool +inputs: + readsFA: File + script: + type: File + default: {class: File, location: relabel-seqs.py} +outputs: + abpoaGFA: + type: stdout +requirements: + InlineJavascriptRequirement: {} +hints: + DockerRequirement: + dockerPull: "quay.io/biocontainers/abpoa:1.0.5--hed695b0_0" + ResourceRequirement: + coresMin: 1 + ramMin: $(15 * 1024) + outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) +baseCommand: abpoa +stdout: $(inputs.readsFA.nameroot).O0.gfa +arguments: [ + $(inputs.readsFA), + -r 3, + -O, '0' +] diff --git a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl b/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl deleted file mode 100644 index eee4031..0000000 --- a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl +++ /dev/null @@ -1,29 +0,0 @@ -cwlVersion: v1.1 -class: CommandLineTool -inputs: - inputGFA: File -outputs: - odgiGraph: - type: File - outputBinding: - glob: $(inputs.inputGFA.nameroot).unchop.sorted.odgi -requirements: - InlineJavascriptRequirement: {} -hints: - DockerRequirement: - dockerPull: "odgi-bash-binutils:latest" - ResourceRequirement: - coresMin: 4 - ramMin: $(15 * 1024) - outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2)) - InitialWorkDirRequirement: - # Will fail if input file is not writable (odgi bug) - listing: - - entry: $(inputs.inputGFA) - writable: true -arguments: - - "sh" - - "-c" - - >- - odgi build -g '$(inputs.inputGFA.path)' -o - | odgi unchop -i - -o - | - odgi sort -i - -p s -o $(inputs.inputGFA.nameroot).unchop.sorted.odgi diff --git a/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl b/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl new file mode 100644 index 0000000..eee4031 --- /dev/null +++ b/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl @@ -0,0 +1,29 @@ +cwlVersion: v1.1 +class: CommandLineTool +inputs: + inputGFA: File +outputs: + odgiGraph: + type: File + outputBinding: + glob: $(inputs.inputGFA.nameroot).unchop.sorted.odgi +requirements: + InlineJavascriptRequirement: {} +hints: + DockerRequirement: + dockerPull: "odgi-bash-binutils:latest" + ResourceRequirement: + coresMin: 4 + ramMin: $(15 * 1024) + outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2)) + InitialWorkDirRequirement: + # Will fail if input file is not writable (odgi bug) + listing: + - entry: $(inputs.inputGFA) + writable: true +arguments: + - "sh" + - "-c" + - >- + odgi build -g '$(inputs.inputGFA.path)' -o - | odgi unchop -i - -o - | + odgi sort -i - -p s -o $(inputs.inputGFA.nameroot).unchop.sorted.odgi diff --git a/workflows/pangenome-generate/pangenome-generate_abpoa.cwl b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl new file mode 100644 index 0000000..8f98dc6 --- /dev/null +++ b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl @@ -0,0 +1,120 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.1 +class: Workflow +requirements: + ScatterFeatureRequirement: {} + StepInputExpressionRequirement: {} +inputs: + seqs: File + metadata: File + bin_widths: + type: int[] + default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000] + doc: width of each bin in basepairs along the graph vector + cells_per_file: + type: int + default: 100 + doc: Cells per file on component_segmentation + reversed_sort: + type: string + default: "true" +outputs: + odgiGraph: + type: File + outputSource: buildGraph/odgiGraph + odgiPNG: + type: File + outputSource: vizGraph/graph_image + abpoaGFA: + type: File + outputSource: induceGraph/abpoaGFA +# odgiRDF: +# type: File +# outputSource: odgi2rdf/rdf + readsMergeDedupSortedByQualAndLen: + type: File + outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA + mergedMetadata: + type: File + outputSource: dups2metadata/merged +# indexed_paths: +# type: File +# outputSource: index_paths/indexed_paths +# colinear_components: +# type: Directory +# outputSource: segment_components/colinear_components +steps: + dedup_and_sort_by_quality_and_len: + in: {readsFA: seqs, reversed_sorting: reversed_sort} + out: [sortedReadsFA, dups] + run: sort_fasta_by_quality_and_len.cwl + induceGraph: + in: + readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA + out: [abpoaGFA] + run: abpoa.cwl + buildGraph: + in: {inputGFA: induceGraph/abpoaGFA} + out: [odgiGraph] + run: odgi-build-from-xpoa-gfa.cwl + vizGraph: + in: + sparse_graph_index: buildGraph/odgiGraph + width: + default: 50000 + height: + default: 500 + path_per_row: + default: true + path_height: + default: 4 + out: [graph_image] + requirements: + ResourceRequirement: + ramMin: $(15 * 1024) + outdirMin: 10 + run: ../tools/odgi/odgi_viz.cwl + # odgi2rdf: + # in: {odgi: buildGraph/odgiGraph} + # out: [rdf] + # run: odgi_to_rdf.cwl + dups2metadata: + in: + metadata: metadata + dups: dedup_and_sort_by_quality_and_len/dups + out: [merged] + run: dups2metadata.cwl + # bin_paths: + # requirements: + # ResourceRequirement: + # ramMin: 3000 + # outdirMin: 10 + # run: ../tools/odgi/odgi_bin.cwl + # in: + # sparse_graph_index: buildGraph/odgiGraph + # bin_width: bin_widths + # scatter: bin_width + # out: [ bins, pangenome_sequence ] + # index_paths: + # label: Create path index + # requirements: + # ResourceRequirement: + # ramMin: 3000 + # outdirMin: 10 + # run: ../tools/odgi/odgi_pathindex.cwl + # in: + # sparse_graph_index: buildGraph/odgiGraph + # out: [ indexed_paths ] + # segment_components: + # label: Run component segmentation + # run: ../tools/graph-genome-segmentation/component_segmentation.cwl + # in: + # bins: bin_paths/bins + # cells_per_file: cells_per_file + # pangenome_sequence: + # source: bin_paths/pangenome_sequence + # valueFrom: $(self[0]) + # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index + # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index + # # regardless of bin_width, so we take the first pangenome_sequence as input for this step + # out: [ colinear_components ] diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl index 4e7302d..fa16809 100644 --- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl @@ -56,7 +56,7 @@ steps: buildGraph: in: {inputGFA: induceGraph/spoaGFA} out: [odgiGraph] - run: odgi-build-from-spoa-gfa.cwl + run: odgi-build-from-xpoa-gfa.cwl vizGraph: in: sparse_graph_index: buildGraph/odgiGraph -- cgit v1.2.3 From bfd830f9d777c456409958030142155043ec1c68 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sat, 21 Nov 2020 23:11:40 +0100 Subject: abPOA works better starting from shorter sequences --- workflows/pangenome-generate/pangenome-generate_abpoa.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/pangenome-generate/pangenome-generate_abpoa.cwl b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl index 8f98dc6..c61e51a 100644 --- a/workflows/pangenome-generate/pangenome-generate_abpoa.cwl +++ b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl @@ -17,7 +17,7 @@ inputs: doc: Cells per file on component_segmentation reversed_sort: type: string - default: "true" + default: "false" outputs: odgiGraph: type: File -- cgit v1.2.3