aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2020-12-24 09:15:39 +0000
committerGitHub2020-12-24 09:15:39 +0000
commitdf0421757e464f07b5e96b5444f1926784e7400f (patch)
treef9a9a262bb2d95a89a6ec1c96b98a2b166fb92b5
parent00ba74b163f723bb7283624171f0c7c203dc99e5 (diff)
parentbfd830f9d777c456409958030142155043ec1c68 (diff)
downloadbh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.tar.gz
bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.tar.lz
bh20-seq-resource-df0421757e464f07b5e96b5444f1926784e7400f.zip
Merge pull request #117 from arvados/pangenome_workflow_abpoa
Pangenome workflow with abPOA
-rw-r--r--scripts/create_sra_metadata/create_sra_metadata.py2
-rw-r--r--workflows/pangenome-generate/abpoa.cwl26
-rw-r--r--workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl (renamed from workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl)0
-rw-r--r--workflows/pangenome-generate/pangenome-generate_abpoa.cwl120
-rw-r--r--workflows/pangenome-generate/pangenome-generate_spoa.cwl7
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl5
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.py17
7 files changed, 168 insertions, 9 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index eebd9aa..35f61a5 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -232,7 +232,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_" + taxon_id
- # This script download and prepare data and metadata for samples that will be mapepd againg a referenceT
+ # This script download and prepare data and metadata for samples that will be mapped against a referenceT
info_for_yaml_dict['technology']['assembly_method'] = 'http://purl.obolibrary.org/obo/GENEPIO_0002028'
EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
diff --git a/workflows/pangenome-generate/abpoa.cwl b/workflows/pangenome-generate/abpoa.cwl
new file mode 100644
index 0000000..fa9f157
--- /dev/null
+++ b/workflows/pangenome-generate/abpoa.cwl
@@ -0,0 +1,26 @@
+cwlVersion: v1.1
+class: CommandLineTool
+inputs:
+ readsFA: File
+ script:
+ type: File
+ default: {class: File, location: relabel-seqs.py}
+outputs:
+ abpoaGFA:
+ type: stdout
+requirements:
+ InlineJavascriptRequirement: {}
+hints:
+ DockerRequirement:
+ dockerPull: "quay.io/biocontainers/abpoa:1.0.5--hed695b0_0"
+ ResourceRequirement:
+ coresMin: 1
+ ramMin: $(15 * 1024)
+ outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
+baseCommand: abpoa
+stdout: $(inputs.readsFA.nameroot).O0.gfa
+arguments: [
+ $(inputs.readsFA),
+ -r 3,
+ -O, '0'
+]
diff --git a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl b/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl
index eee4031..eee4031 100644
--- a/workflows/pangenome-generate/odgi-build-from-spoa-gfa.cwl
+++ b/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl
diff --git a/workflows/pangenome-generate/pangenome-generate_abpoa.cwl b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl
new file mode 100644
index 0000000..c61e51a
--- /dev/null
+++ b/workflows/pangenome-generate/pangenome-generate_abpoa.cwl
@@ -0,0 +1,120 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.1
+class: Workflow
+requirements:
+ ScatterFeatureRequirement: {}
+ StepInputExpressionRequirement: {}
+inputs:
+ seqs: File
+ metadata: File
+ bin_widths:
+ type: int[]
+ default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
+ doc: width of each bin in basepairs along the graph vector
+ cells_per_file:
+ type: int
+ default: 100
+ doc: Cells per file on component_segmentation
+ reversed_sort:
+ type: string
+ default: "false"
+outputs:
+ odgiGraph:
+ type: File
+ outputSource: buildGraph/odgiGraph
+ odgiPNG:
+ type: File
+ outputSource: vizGraph/graph_image
+ abpoaGFA:
+ type: File
+ outputSource: induceGraph/abpoaGFA
+# odgiRDF:
+# type: File
+# outputSource: odgi2rdf/rdf
+ readsMergeDedupSortedByQualAndLen:
+ type: File
+ outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA
+ mergedMetadata:
+ type: File
+ outputSource: dups2metadata/merged
+# indexed_paths:
+# type: File
+# outputSource: index_paths/indexed_paths
+# colinear_components:
+# type: Directory
+# outputSource: segment_components/colinear_components
+steps:
+ dedup_and_sort_by_quality_and_len:
+ in: {readsFA: seqs, reversed_sorting: reversed_sort}
+ out: [sortedReadsFA, dups]
+ run: sort_fasta_by_quality_and_len.cwl
+ induceGraph:
+ in:
+ readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA
+ out: [abpoaGFA]
+ run: abpoa.cwl
+ buildGraph:
+ in: {inputGFA: induceGraph/abpoaGFA}
+ out: [odgiGraph]
+ run: odgi-build-from-xpoa-gfa.cwl
+ vizGraph:
+ in:
+ sparse_graph_index: buildGraph/odgiGraph
+ width:
+ default: 50000
+ height:
+ default: 500
+ path_per_row:
+ default: true
+ path_height:
+ default: 4
+ out: [graph_image]
+ requirements:
+ ResourceRequirement:
+ ramMin: $(15 * 1024)
+ outdirMin: 10
+ run: ../tools/odgi/odgi_viz.cwl
+ # odgi2rdf:
+ # in: {odgi: buildGraph/odgiGraph}
+ # out: [rdf]
+ # run: odgi_to_rdf.cwl
+ dups2metadata:
+ in:
+ metadata: metadata
+ dups: dedup_and_sort_by_quality_and_len/dups
+ out: [merged]
+ run: dups2metadata.cwl
+ # bin_paths:
+ # requirements:
+ # ResourceRequirement:
+ # ramMin: 3000
+ # outdirMin: 10
+ # run: ../tools/odgi/odgi_bin.cwl
+ # in:
+ # sparse_graph_index: buildGraph/odgiGraph
+ # bin_width: bin_widths
+ # scatter: bin_width
+ # out: [ bins, pangenome_sequence ]
+ # index_paths:
+ # label: Create path index
+ # requirements:
+ # ResourceRequirement:
+ # ramMin: 3000
+ # outdirMin: 10
+ # run: ../tools/odgi/odgi_pathindex.cwl
+ # in:
+ # sparse_graph_index: buildGraph/odgiGraph
+ # out: [ indexed_paths ]
+ # segment_components:
+ # label: Run component segmentation
+ # run: ../tools/graph-genome-segmentation/component_segmentation.cwl
+ # in:
+ # bins: bin_paths/bins
+ # cells_per_file: cells_per_file
+ # pangenome_sequence:
+ # source: bin_paths/pangenome_sequence
+ # valueFrom: $(self[0])
+ # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
+ # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
+ # # regardless of bin_width, so we take the first pangenome_sequence as input for this step
+ # out: [ colinear_components ]
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
index b623054..fa16809 100644
--- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -15,6 +15,9 @@ inputs:
type: int
default: 100
doc: Cells per file on component_segmentation
+ reversed_sort:
+ type: string
+ default: "true"
outputs:
odgiGraph:
type: File
@@ -42,7 +45,7 @@ outputs:
# outputSource: segment_components/colinear_components
steps:
dedup_and_sort_by_quality_and_len:
- in: {readsFA: seqs}
+ in: {readsFA: seqs, reversed_sorting: reversed_sort}
out: [sortedReadsFA, dups]
run: sort_fasta_by_quality_and_len.cwl
induceGraph:
@@ -53,7 +56,7 @@ steps:
buildGraph:
in: {inputGFA: induceGraph/spoaGFA}
out: [odgiGraph]
- run: odgi-build-from-spoa-gfa.cwl
+ run: odgi-build-from-xpoa-gfa.cwl
vizGraph:
in:
sparse_graph_index: buildGraph/odgiGraph
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
index 26c20e8..6f71d92 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
@@ -1,3 +1,4 @@
+
cwlVersion: v1.1
class: CommandLineTool
hints:
@@ -5,6 +6,9 @@ hints:
coresMin: 1
ramMin: 3000
inputs:
+ reversed_sorting:
+ type: string
+ inputBinding: {position: 3}
readsFA:
type: File
inputBinding: {position: 2}
@@ -23,3 +27,4 @@ requirements:
InlineJavascriptRequirement: {}
ShellCommandRequirement: {}
baseCommand: [python]
+
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
index e836654..02ebf60 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py
@@ -7,15 +7,17 @@ import os
import sys
import gzip
-#import xxhash # Faster library
+# import xxhash # Faster library
import hashlib
+
def open_gzipsafe(path_file):
if path_file.endswith('.gz'):
- return gzip.open(path_file, 'rt')
+ return gzip.open(path_file, 'rt')
else:
return open(path_file)
+
path_fasta = sys.argv[1]
hash_to_count_and_headers_dict = {}
@@ -28,7 +30,7 @@ with open_gzipsafe(path_fasta) as f:
header = fasta.strip('\n').split('\n')[0]
sequence = ''.join(fasta.strip('\n').split('\n')[1:])
- ##hash = xxhash.xxh64(sequence).hexdigest() # Faster library
+ # hash = xxhash.xxh64(sequence).hexdigest() # Faster library
hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
if hash not in hash_to_count_and_headers_dict:
@@ -38,15 +40,18 @@ with open_gzipsafe(path_fasta) as f:
header_to_seq_dict[header] = sequence
seq_len = len(sequence)
- header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N'))/seq_len, seq_len])
+ header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len])
hash_to_count_and_headers_dict[hash][0] += 1
hash_to_count_and_headers_dict[hash][1].append(header)
-
with open('dups.txt', 'w') as fw:
for count, header_list in hash_to_count_and_headers_dict.values():
fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
-for header, percCalledBases, seqLength_list in sorted(header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse = True):
+reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False
+
+for header, percCalledBases, seqLength_list in sorted(
+ header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting
+):
sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))