about | summary | refs | log | tree | commit | diff
path: root/workflows
diff options
context:
space:
mode:
author Pjotr Prins 2020-05-27 07:10:05 -0500
committer Pjotr Prins 2020-05-27 07:10:05 -0500
commit 20da90784ad30b51549407dd4f390254c415ee6b (patch)
tree e69bab9f04866704dc9e5c30f3720da3c2f9c676 /workflows
parent f2d44157283cccffb60b9b2716f8568ef67fb4f5 (diff)
parent 7a96d0b1b15ab28fe3a618db35364891ab5d0328 (diff)
download bh20-seq-resource-20da90784ad30b51549407dd4f390254c415ee6b.tar.gz
bh20-seq-resource-20da90784ad30b51549407dd4f390254c415ee6b.tar.lz
bh20-seq-resource-20da90784ad30b51549407dd4f390254c415ee6b.zip
Merge branch 'master' of github.com:arvados/bh20-seq-resource
Diffstat (limited to 'workflows')
-rw-r--r-- workflows/pangenome-generate/pangenome-generate.cwl 2
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.cwl 5
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.py 22
3 files changed, 23 insertions, 6 deletions
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index ad8b27f..9118cf8 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -9,6 +9,7 @@ inputs:
metadata: File[]
metadataSchema: File
subjects: string[]
+ exclude: File?
bin_widths:
type: int[]
default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
@@ -47,6 +48,7 @@ steps:
in:
readsFA: inputReads
subjects: subjects
+ exclude: exclude
out: [relabeledSeqs, originalLabels]
run: relabel-seqs.cwl
dedup:
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
index c1f17a4..367b9bf 100644
--- a/workflows/pangenome-generate/relabel-seqs.cwl
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -3,10 +3,13 @@ class: CommandLineTool
inputs:
readsFA: File[]
subjects: string[]
+ exclude:
+ type: File?
+ inputBinding: {position: 2}
script:
type: File
default: {class: File, location: relabel-seqs.py}
- inputBinding: {}
+ inputBinding: {position: 1}
outputs:
relabeledSeqs:
type: File
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 6b022a0..25b4a08 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,5 +1,6 @@
import os
import json
+import sys
def readitems(stem):
items = []
@@ -16,15 +17,26 @@ subjects = readitems("subs")
relabeled_fasta = open("relabeledSeqs.fasta", "wt")
original_labels = open("originalLabels.ttl", "wt")
+blacklist = set()
+if len(sys.argv) > 1:
+ with open(sys.argv[1]) as bl:
+ for l in bl:
+ blacklist.add(l.strip())
+
for i, r in enumerate(reads):
with open(r["path"], "rt") as fa:
- label = fa.readline()
- original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"')))
- relabeled_fasta.write(">"+subjects[i]+"\n")
+ label = fa.readline().strip()
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].replace('"', '\\"')))
+ skip = (subjects[i] in blacklist or label[1:] in blacklist)
+ if skip:
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % (subjects[i]))
+ if not skip:
+ relabeled_fasta.write(">"+subjects[i]+"\n")
data = fa.read(8096)
while data:
- relabeled_fasta.write(data)
+ if not skip:
+ relabeled_fasta.write(data)
endswithnewline = data.endswith("\n")
data = fa.read(8096)
- if not endswithnewline:
+ if not skip and not endswithnewline:
relabeled_fasta.write("\n")