author     Peter Amstutz   2020-05-26 17:30:30 -0400
committer  Peter Amstutz   2020-05-26 18:12:23 -0400
commit     7a96d0b1b15ab28fe3a618db35364891ab5d0328
tree       570532d2cc4c490175a4042d7bfabaad5120312d
parent     30f3f8b0e9efbc954518fc8ea621b53c9591c83a
Can have list of sequence labels to exclude from combined fasta

refs #68

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
-rw-r--r--  bh20seqanalyzer/main.py                              | 15
-rw-r--r--  workflows/pangenome-generate/pangenome-generate.cwl  |  2
-rw-r--r--  workflows/pangenome-generate/relabel-seqs.cwl        |  5
-rw-r--r--  workflows/pangenome-generate/relabel-seqs.py         | 22
4 files changed, 35 insertions, 9 deletions
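The exclude list itself is a plain-text file, one entry per line (the default added below lives at keep:lugli-4zz18-tzzhcm6hrf8ci8d/exclude.txt). As implemented in relabel-seqs.py further down, each entry is compared against both a sequence's relabeled subject identifier and its original FASTA header without the leading ">". A purely hypothetical exclude.txt, with made-up entries:

    http://example.org/sequence/sample-42
    ExampleIsolate/duplicate-submission/2020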
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 28b5e31..31ad4c4 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -125,7 +125,8 @@ def start_pangenome_analysis(api,
                              analysis_project,
                              pangenome_workflow_uuid,
                              validated_project,
-                             schema_ref):
+                             schema_ref,
+                             exclude_list):
     validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]])
     inputobj = {
         "inputReads": [],
@@ -134,6 +135,10 @@ def start_pangenome_analysis(api,
         "metadataSchema": {
             "class": "File",
             "location": schema_ref
+        },
+        "exclude": {
+            "class": "File",
+            "location": exclude_list
         }
     }
     validated.sort(key=lambda v: v["portable_data_hash"])
@@ -213,6 +218,8 @@ def main():
     parser.add_argument('--pangenome-workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='')
     parser.add_argument('--fastq-workflow-uuid', type=str, default='lugli-7fd4e-2zp9q4jo5xpif9y', help='')
+    parser.add_argument('--exclude-list', type=str, default='keep:lugli-4zz18-tzzhcm6hrf8ci8d/exclude.txt', help='')
+
     parser.add_argument('--latest-result-collection', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='')
     parser.add_argument('--kickoff', action="store_true")
     args = parser.parse_args()
@@ -229,7 +236,8 @@
                                  args.pangenome_analysis_project,
                                  args.pangenome_workflow_uuid,
                                  args.validated_project,
-                                 schema_ref)
+                                 schema_ref,
+                                 args.exclude_list)
         return

     logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project))
@@ -250,7 +258,8 @@
                                          args.pangenome_analysis_project,
                                          args.pangenome_workflow_uuid,
                                          args.validated_project,
-                                         schema_ref)
+                                         schema_ref,
+                                         args.exclude_list)

             copy_most_recent_result(api,
                                     args.pangenome_analysis_project,
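For reference, the input object that start_pangenome_analysis now submits to the pangenome workflow gains one extra File entry alongside the existing inputs; roughly (other keys unchanged and omitted here, values as in the hunks above):

    inputobj = {
        "inputReads": [],
        # ... other workflow inputs ...
        "metadataSchema": {
            "class": "File",
            "location": schema_ref
        },
        "exclude": {
            "class": "File",
            # e.g. the --exclude-list default, keep:lugli-4zz18-tzzhcm6hrf8ci8d/exclude.txt
            "location": exclude_list
        }
    }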
diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl
index ad8b27f..9118cf8 100644
--- a/workflows/pangenome-generate/pangenome-generate.cwl
+++ b/workflows/pangenome-generate/pangenome-generate.cwl
@@ -9,6 +9,7 @@ inputs:
   metadata: File[]
   metadataSchema: File
   subjects: string[]
+  exclude: File?
   bin_widths:
     type: int[]
     default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
@@ -47,6 +48,7 @@ steps:
     in:
       readsFA: inputReads
       subjects: subjects
+      exclude: exclude
     out: [relabeledSeqs, originalLabels]
     run: relabel-seqs.cwl
   dedup:
diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl
index c1f17a4..367b9bf 100644
--- a/workflows/pangenome-generate/relabel-seqs.cwl
+++ b/workflows/pangenome-generate/relabel-seqs.cwl
@@ -3,10 +3,13 @@ class: CommandLineTool
 inputs:
   readsFA: File[]
   subjects: string[]
+  exclude:
+    type: File?
+    inputBinding: {position: 2}
   script:
     type: File
     default: {class: File, location: relabel-seqs.py}
-    inputBinding: {}
+    inputBinding: {position: 1}
 outputs:
   relabeledSeqs:
     type: File
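With these positional bindings the script file lands at argument position 1 and the optional exclude file at position 2, so the generated command is shaped roughly like the line below (assuming the tool's baseCommand, which is outside this hunk, invokes a Python interpreter); inside the script the exclude list is then visible as sys.argv[1]:

    python relabel-seqs.py exclude.txt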
diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py
index 6b022a0..25b4a08 100644
--- a/workflows/pangenome-generate/relabel-seqs.py
+++ b/workflows/pangenome-generate/relabel-seqs.py
@@ -1,5 +1,6 @@
 import os
 import json
+import sys

 def readitems(stem):
     items = []
@@ -16,15 +17,26 @@ subjects = readitems("subs")
 relabeled_fasta = open("relabeledSeqs.fasta", "wt")
 original_labels = open("originalLabels.ttl", "wt")

+blacklist = set()
+if len(sys.argv) > 1:
+    with open(sys.argv[1]) as bl:
+        for l in bl:
+            blacklist.add(l.strip())
+
 for i, r in enumerate(reads):
     with open(r["path"], "rt") as fa:
-        label = fa.readline()
-        original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"')))
-        relabeled_fasta.write(">"+subjects[i]+"\n")
+        label = fa.readline().strip()
+        original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].replace('"', '\\"')))
+        skip = (subjects[i] in blacklist or label[1:] in blacklist)
+        if skip:
+            original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % (subjects[i]))
+        if not skip:
+            relabeled_fasta.write(">"+subjects[i]+"\n")
         data = fa.read(8096)
         while data:
-            relabeled_fasta.write(data)
+            if not skip:
+                relabeled_fasta.write(data)
             endswithnewline = data.endswith("\n")
             data = fa.read(8096)
-        if not endswithnewline:
+        if not skip and not endswithnewline:
             relabeled_fasta.write("\n")
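Net effect of the change above: a record is dropped from relabeledSeqs.fasta, and flagged with excluded_from_graph in originalLabels.ttl, whenever either its subject identifier or its original FASTA header (minus the leading ">") appears in the exclude list. A minimal, self-contained sketch of just that matching rule, with made-up labels:

    # Sketch of the exclusion test added above; labels are illustrative only.
    blacklist = {"http://example.org/sequence/sample-42", "ExampleIsolate/duplicate-submission/2020"}

    def keep_record(subject, original_header):
        # original_header is the first FASTA line without the leading ">".
        return subject not in blacklist and original_header not in blacklist

    assert keep_record("http://example.org/sequence/sample-1", "OtherIsolate/2020")           # kept
    assert not keep_record("http://example.org/sequence/sample-42", "OtherIsolate/2020")      # subject excluded
    assert not keep_record("http://example.org/sequence/sample-1",
                           "ExampleIsolate/duplicate-submission/2020")                        # original header excluded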