about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- workflows/pangenome-generate/pangenome-generate.cwl 12
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.cwl 19
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.py 13
-rw-r--r-- workflows/pangenome-generate/seqkit-rmdup.cwl 5
4 files changed, 48 insertions, 1 deletion
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index 2710743..896f936 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -22,8 +22,18 @@ outputs:
     type: File
     outputSource: mergeMetadata/merged
 steps:
+  relabel:  # new step: passes inputReads and subjects to relabel-seqs.cwl
+    in:
+      readsFA: inputReads
+      subjects: subjects
+    out: [relabeledSeqs]  # read by the `common` and `dedup` steps as relabel/relabeledSeqs
+    run: relabel-seqs.cwl
+  common:  # NOTE(review): `duplicatedReads` is not referenced anywhere else in this hunk — confirm it is consumed
+    in: {readsFA: relabel/relabeledSeqs}
+    out: [duplicatedReads]
+    run: seqkit-common.cwl  # NOTE(review): seqkit-common.cwl is not one of the files changed in this diff — confirm it exists
   dedup:
-    in: {readsFA: inputReads}
+    in: {readsFA: relabel/relabeledSeqs}  # dedup now reads the relabel step's output instead of inputReads
     out: [readsMergeDedup]
     run: seqkit-rmdup.cwl
   overlapReads:
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
new file mode 100644
index 0000000..b5b7231
--- /dev/null
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -0,0 +1,19 @@
+cwlVersion: v1.1
+class: CommandLineTool  # runs relabel-seqs.py and captures what it prints as relabeledSeqs.fasta
+inputs:
+  readsFA: File[]  # files to concatenate; relabel-seqs.py opens each one by its `path`
+  subjects: string[]  # subjects[i] replaces the first line of readsFA[i]; presumably the same length as readsFA
+outputs:
+  relabeledSeqs:
+    type: stdout  # the captured standard output, named by `stdout:` below
+requirements:
+  InlineJavascriptRequirement: {}
+  InitialWorkDirRequirement:
+    listing:
+      - entry: {$include: relabel-seqs.py}  # script text is included here and staged under `entryname`
+        entryname: relabel-seqs.py
+hints:
+  DockerRequirement:
+    dockerPull: commonworkflowlanguage/cwltool_module  # presumably used only for its Python interpreter — confirm
+stdout: relabeledSeqs.fasta
+baseCommand: [python, relabel-seqs.py]  # runs the staged script; no command-line arguments are defined
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
new file mode 100644
index 0000000..32f2386
--- /dev/null
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -0,0 +1,13 @@
+import sys
+# Concatenate the input files to stdout, replacing each file's first line with ">" + subjects[i].
+reads = $(inputs.readsFA)  # CWL parameter reference — presumably substituted before Python sees this text
+subjects = $(inputs.subjects)
+# A file that lacks a final newline gets one appended so the next header starts on its own line.
+for i, r in enumerate(reads):
+    with open(r["path"], "rt") as fa:
+        fa.readline()  # discard the file's original first (header) line
+        print(">"+subjects[i])
+        data = "\n"  # stays "\n" when nothing follows the header, so no blank line is added
+        for data in iter(lambda: fa.read(8096), ""):  # stream the rest in 8096-character chunks
+            sys.stdout.write(data)
+        sys.stdout.write("" if data.endswith("\n") else "\n")
diff --git a/workflows/pangenome-generate/seqkit-rmdup.cwl b/workflows/pangenome-generate/seqkit-rmdup.cwl
index d3626f5..07184c3 100644
--- a/workflows/pangenome-generate/seqkit-rmdup.cwl
+++ b/workflows/pangenome-generate/seqkit-rmdup.cwl
@@ -7,6 +7,10 @@ outputs:
     type: File
     outputBinding:
       glob: readsMergeDedup.fasta
+  dups:  # the file named by the `--dup-num-file` argument added further down in this diff
+    type: File  # NOTE(review): the dedup step in pangenome-generate.cwl lists only readsMergeDedup in `out` — confirm whether dups should be exposed
+    outputBinding:
+      glob: dups.txt
 requirements:
   InlineJavascriptRequirement: {}
 hints:
@@ -28,5 +32,6 @@ baseCommand: seqkit
 arguments: [rmdup,
             --by-seq,
             --ignore-case,
+            --dup-num-file, dups.txt,  # seqkit saves the numbers and ID lists of duplicated sequences here
             -o, readsMergeDedup.fasta,
             $(inputs.readsFA)]