diff options
author | Peter Amstutz | 2020-04-20 13:41:56 -0400 |
---|---|---|
committer | Peter Amstutz | 2020-04-20 13:41:56 -0400 |
commit | d29dfd593233541b85c1cefb239650279d57d59f (patch) | |
tree | 51fa615a05a2606021bdf8c537ad8b36ae4e0a8c /workflows | |
parent | d781e42c9adac07253cb928ae66e9b7314710267 (diff) | |
download | bh20-seq-resource-d29dfd593233541b85c1cefb239650279d57d59f.tar.gz bh20-seq-resource-d29dfd593233541b85c1cefb239650279d57d59f.tar.lz bh20-seq-resource-d29dfd593233541b85c1cefb239650279d57d59f.zip |
Relabel sequences to match metadata subjects.
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
Diffstat (limited to 'workflows')
-rw-r--r-- | workflows/pangenome-generate/pangenome-generate.cwl | 12 | ||||
-rw-r--r-- | workflows/pangenome-generate/relabel-seqs.cwl | 19 | ||||
-rw-r--r-- | workflows/pangenome-generate/relabel-seqs.py | 13 | ||||
-rw-r--r-- | workflows/pangenome-generate/seqkit-rmdup.cwl | 5 |
4 files changed, 48 insertions, 1 deletions
```diff
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index 2710743..896f936 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -22,8 +22,18 @@ outputs:
     type: File
     outputSource: mergeMetadata/merged
 steps:
+  relabel:
+    in:
+      readsFA: inputReads
+      subjects: subjects
+    out: [relabeledSeqs]
+    run: relabel-seqs.cwl
+  common:
+    in: {readsFA: relabel/relabeledSeqs}
+    out: [duplicatedReads]
+    run: seqkit-common.cwl
   dedup:
-    in: {readsFA: inputReads}
+    in: {readsFA: relabel/relabeledSeqs}
     out: [readsMergeDedup]
     run: seqkit-rmdup.cwl
   overlapReads:
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
new file mode 100644
index 0000000..b5b7231
--- /dev/null
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -0,0 +1,19 @@
+cwlVersion: v1.1
+class: CommandLineTool
+inputs:
+  readsFA: File[]
+  subjects: string[]
+outputs:
+  relabeledSeqs:
+    type: stdout
+requirements:
+  InlineJavascriptRequirement: {}
+  InitialWorkDirRequirement:
+    listing:
+      - entry: {$include: relabel-seqs.py}
+        entryname: relabel-seqs.py
+hints:
+  DockerRequirement:
+    dockerPull: commonworkflowlanguage/cwltool_module
+stdout: relabeledSeqs.fasta
+baseCommand: [python, relabel-seqs.py]
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
new file mode 100644
index 0000000..32f2386
--- /dev/null
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -0,0 +1,13 @@
+import sys
+
+reads = $(inputs.readsFA)
+subjects = $(inputs.subjects)
+
+for i, r in enumerate(reads):
+    with open(r["path"], "rt") as fa:
+        fa.readline()
+        print(">"+subjects[i])
+        data = fa.read(8096)
+        while data:
+            sys.stdout.write(data)
+            data = fa.read(8096)
diff --git a/workflows/pangenome-generate/seqkit-rmdup.cwl b/workflows/pangenome-generate/seqkit-rmdup.cwl
index d3626f5..07184c3 100644
--- a/workflows/pangenome-generate/seqkit-rmdup.cwl
+++ b/workflows/pangenome-generate/seqkit-rmdup.cwl
@@ -7,6 +7,10 @@ outputs:
     type: File
     outputBinding:
       glob: readsMergeDedup.fasta
+  dups:
+    type: File
+    outputBinding:
+      glob: dups.txt
 requirements:
   InlineJavascriptRequirement: {}
 hints:
@@ -28,5 +32,6 @@ baseCommand: seqkit
 arguments: [rmdup,
             --by-seq,
             --ignore-case,
+            --dup-num-file, dups.txt,
             -o, readsMergeDedup.fasta,
             $(inputs.readsFA)]
```