about | summary | refs | log | tree | commit | diff
path: root/workflows/pangenome-generate/relabel-seqs.py
diff options
context:
space:
mode:
author Pjotr Prins 2020-05-27 07:10:05 -0500
committer Pjotr Prins 2020-05-27 07:10:05 -0500
commit 20da90784ad30b51549407dd4f390254c415ee6b (patch)
tree e69bab9f04866704dc9e5c30f3720da3c2f9c676 /workflows/pangenome-generate/relabel-seqs.py
parent f2d44157283cccffb60b9b2716f8568ef67fb4f5 (diff)
parent 7a96d0b1b15ab28fe3a618db35364891ab5d0328 (diff)
download bh20-seq-resource-20da90784ad30b51549407dd4f390254c415ee6b.tar.gz
bh20-seq-resource-20da90784ad30b51549407dd4f390254c415ee6b.tar.lz
bh20-seq-resource-20da90784ad30b51549407dd4f390254c415ee6b.zip
Merge branch 'master' of github.com:arvados/bh20-seq-resource
Diffstat (limited to 'workflows/pangenome-generate/relabel-seqs.py')
-rw-r--r-- workflows/pangenome-generate/relabel-seqs.py 22
1 files changed, 17 insertions, 5 deletions
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 6b022a0..25b4a08 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,5 +1,6 @@
import os
import json
+import sys
def readitems(stem):
items = []
@@ -16,15 +17,26 @@ subjects = readitems("subs")
relabeled_fasta = open("relabeledSeqs.fasta", "wt")
original_labels = open("originalLabels.ttl", "wt")
+blacklist = set()
+if len(sys.argv) > 1:
+ with open(sys.argv[1]) as bl:
+ for l in bl:
+ blacklist.add(l.strip())
+
for i, r in enumerate(reads):
with open(r["path"], "rt") as fa:
- label = fa.readline()
- original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"')))
- relabeled_fasta.write(">"+subjects[i]+"\n")
+ label = fa.readline().strip()
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].replace('"', '\\"')))
+ skip = (subjects[i] in blacklist or label[1:] in blacklist)
+ if skip:
+ original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % (subjects[i]))
+ if not skip:
+ relabeled_fasta.write(">"+subjects[i]+"\n")
data = fa.read(8096)
while data:
- relabeled_fasta.write(data)
+ if not skip:
+ relabeled_fasta.write(data)
endswithnewline = data.endswith("\n")
data = fa.read(8096)
- if not endswithnewline:
+ if not skip and not endswithnewline:
relabeled_fasta.write("\n")