From 9ddcfeacb3191638f42b08af999889d867f0f81c Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 20 Apr 2020 14:57:25 -0400
Subject: Better handling of duplicate sequences

Also save original fasta label in metadata
---
 workflows/pangenome-generate/relabel-seqs.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'workflows/pangenome-generate/relabel-seqs.py')

diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 32f2386..b558fe2 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,13 +1,15 @@
-import sys
-
 reads = $(inputs.readsFA)
 subjects = $(inputs.subjects)
 
+relabeled_fasta = open("relabeledSeqs.fasta", "wt")
+original_labels = open("originalLabels.ttl", "wt")
+
 for i, r in enumerate(reads):
     with open(r["path"], "rt") as fa:
-        fa.readline()
-        print(">"+subjects[i])
+        label = fa.readline()
+        original_labels.write("<%s> \\"%s\\" .\\n" % (subjects[i], label[1:].strip().replace('"', '\\\\"')))
+        relabeled_fasta.write(">"+subjects[i]+"\\n")
         data = fa.read(8096)
         while data:
-            sys.stdout.write(data)
+            relabeled_fasta.write(data)
             data = fa.read(8096)
-- 
cgit v1.2.3