From d29dfd593233541b85c1cefb239650279d57d59f Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 13:41:56 -0400
Subject: Relabel sequences to match metadata subjects.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
Review notes (not part of the commit message):

* relabel-seqs.py replaces only the first line of each input with
  ">" + subject and copies the rest verbatim.  If an input does not end
  with a newline, the next record's header is written onto the same line
  as the previous sequence; a multi-record input keeps its later headers.
* relabel-seqs.py indexes subjects[i] for every read, so fewer subjects
  than reads ends in an IndexError part-way through the output.
* The new "common" step runs seqkit-common.cwl, which is not among the
  files touched here; confirm it already exists in the tree.
* seqkit-rmdup.cwl gains a "dups" output, but the dedup step still lists
  only readsMergeDedup in "out"; confirm that is intended.
* Line structure and indentation below were restored from a flattened
  copy of this patch; verify whitespace against the repository before
  applying.

 workflows/pangenome-generate/pangenome-generate.cwl | 12 +++++++++++-
 workflows/pangenome-generate/relabel-seqs.cwl       | 19 +++++++++++++++++++
 workflows/pangenome-generate/relabel-seqs.py        | 13 +++++++++++++
 workflows/pangenome-generate/seqkit-rmdup.cwl       |  5 +++++
 4 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 workflows/pangenome-generate/relabel-seqs.cwl
 create mode 100644 workflows/pangenome-generate/relabel-seqs.py

(limited to 'workflows/pangenome-generate')

diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index 2710743..896f936 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -22,8 +22,18 @@ outputs:
     type: File
     outputSource: mergeMetadata/merged
 steps:
+  relabel:
+    in:
+      readsFA: inputReads
+      subjects: subjects
+    out: [relabeledSeqs]
+    run: relabel-seqs.cwl
+  common:
+    in: {readsFA: relabel/relabeledSeqs}
+    out: [duplicatedReads]
+    run: seqkit-common.cwl
   dedup:
-    in: {readsFA: inputReads}
+    in: {readsFA: relabel/relabeledSeqs}
     out: [readsMergeDedup]
     run: seqkit-rmdup.cwl
   overlapReads:
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
new file mode 100644
index 0000000..b5b7231
--- /dev/null
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -0,0 +1,19 @@
+cwlVersion: v1.1
+class: CommandLineTool
+inputs:
+  readsFA: File[]
+  subjects: string[]
+outputs:
+  relabeledSeqs:
+    type: stdout
+requirements:
+  InlineJavascriptRequirement: {}
+  InitialWorkDirRequirement:
+    listing:
+      - entry: {$include: relabel-seqs.py}
+        entryname: relabel-seqs.py
+hints:
+  DockerRequirement:
+    dockerPull: commonworkflowlanguage/cwltool_module
+stdout: relabeledSeqs.fasta
+baseCommand: [python, relabel-seqs.py]
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
new file mode 100644
index 0000000..32f2386
--- /dev/null
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -0,0 +1,13 @@
+import sys
+
+reads = $(inputs.readsFA)
+subjects = $(inputs.subjects)
+
+for i, r in enumerate(reads):
+    with open(r["path"], "rt") as fa:
+        fa.readline()
+        print(">"+subjects[i])
+        data = fa.read(8096)
+        while data:
+            sys.stdout.write(data)
+            data = fa.read(8096)
diff --git a/workflows/pangenome-generate/seqkit-rmdup.cwl b/workflows/pangenome-generate/seqkit-rmdup.cwl
index d3626f5..07184c3 100644
--- a/workflows/pangenome-generate/seqkit-rmdup.cwl
+++ b/workflows/pangenome-generate/seqkit-rmdup.cwl
@@ -7,6 +7,10 @@ outputs:
     type: File
     outputBinding:
       glob: readsMergeDedup.fasta
+  dups:
+    type: File
+    outputBinding:
+      glob: dups.txt
 requirements:
   InlineJavascriptRequirement: {}
 hints:
@@ -28,5 +32,6 @@ baseCommand: seqkit
 arguments: [rmdup,
             --by-seq,
             --ignore-case,
+            --dup-num-file, dups.txt,
             -o, readsMergeDedup.fasta,
             $(inputs.readsFA)]
-- 
cgit v1.2.3