aboutsummaryrefslogtreecommitdiff
path: root/workflows/pangenome-generate
diff options
context:
space:
mode:
author: Peter Amstutz, 2020-05-26 17:30:30 -0400
committer: Peter Amstutz, 2020-05-26 18:12:23 -0400
commit: 7a96d0b1b15ab28fe3a618db35364891ab5d0328 (patch)
tree: 570532d2cc4c490175a4042d7bfabaad5120312d /workflows/pangenome-generate
parent: 30f3f8b0e9efbc954518fc8ea621b53c9591c83a (diff)
download: bh20-seq-resource-7a96d0b1b15ab28fe3a618db35364891ab5d0328.tar.gz
bh20-seq-resource-7a96d0b1b15ab28fe3a618db35364891ab5d0328.tar.lz
bh20-seq-resource-7a96d0b1b15ab28fe3a618db35364891ab5d0328.zip
Can have list of sequence labels to exclude from combined fasta
refs #68 Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
Diffstat (limited to 'workflows/pangenome-generate')
-rw-r--r-- workflows/pangenome-generate/pangenome-generate.cwl 2
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.cwl 5
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.py 22
3 files changed, 23 insertions, 6 deletions
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index ad8b27f..9118cf8 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -9,6 +9,7 @@ inputs:
metadata: File[]
metadataSchema: File
subjects: string[]
+ exclude: File?
bin_widths:
type: int[]
default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
@@ -47,6 +48,7 @@ steps:
in:
readsFA: inputReads
subjects: subjects
+ exclude: exclude
out: [relabeledSeqs, originalLabels]
run: relabel-seqs.cwl
dedup:
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
index c1f17a4..367b9bf 100644
--- a/workflows/pangenome-generate/relabel-seqs.cwl
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -3,10 +3,13 @@ class: CommandLineTool
inputs:
readsFA: File[]
subjects: string[]
+ exclude:
+ type: File?
+ inputBinding: {position: 2}
script:
type: File
default: {class: File, location: relabel-seqs.py}
- inputBinding: {}
+ inputBinding: {position: 1}
outputs:
relabeledSeqs:
type: File
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 6b022a0..25b4a08 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,5 +1,6 @@
import os
import json
+import sys
def readitems(stem):
items = []
@@ -16,15 +17,26 @@ subjects = readitems("subs")
relabeled_fasta = open("relabeledSeqs.fasta", "wt")
original_labels = open("originalLabels.ttl", "wt")
+blacklist = set()
+if len(sys.argv) > 1:
+ with open(sys.argv[1]) as bl:
+ for l in bl:
+ blacklist.add(l.strip())
+
for i, r in enumerate(reads):
with open(r["path"], "rt") as fa:
- label = fa.readline()
- original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"')))
- relabeled_fasta.write(">"+subjects[i]+"\n")
+ label = fa.readline().strip()
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].replace('"', '\\"')))
+ skip = (subjects[i] in blacklist or label[1:] in blacklist)
+ if skip:
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % (subjects[i]))
+ if not skip:
+ relabeled_fasta.write(">"+subjects[i]+"\n")
data = fa.read(8096)
while data:
- relabeled_fasta.write(data)
+ if not skip:
+ relabeled_fasta.write(data)
endswithnewline = data.endswith("\n")
data = fa.read(8096)
- if not endswithnewline:
+ if not skip and not endswithnewline:
relabeled_fasta.write("\n")