From 88d81f853cf04b7f28681dd9cdee775b0422f252 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 12:53:19 -0400 Subject: Working on NCBI import Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'bh20sequploader/main.py') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 49d012d..2fda347 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -44,7 +44,8 @@ def main(): with col.open(target, "w") as f: r = args.sequence.read(65536) - print(r[0:20]) + seqlabel = r[1:r.index("\n")] + print(seqlabel) while r: f.write(r) r = args.sequence.read(65536) @@ -67,8 +68,8 @@ def main(): "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()) } - col.save_new(owner_uuid=UPLOAD_PROJECT, name="Uploaded by %s from %s" % - (properties['upload_user'], properties['upload_ip']), + col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" % + (seqlabel, properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) print("Done") -- cgit v1.2.3 From f4c3da88c1233802fea46cc972a81dc3b5b51185 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 21 Apr 2020 15:37:58 -0400 Subject: Work around CWL content size limit by chunking Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/main.py | 1 + workflows/pangenome-generate/relabel-seqs.cwl | 31 +++++++++++++++++++++++---- workflows/pangenome-generate/relabel-seqs.py | 22 +++++++++++++------ 3 files changed, 44 insertions(+), 10 deletions(-) (limited to 'bh20sequploader/main.py') diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 2fda347..4c4711d 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -63,6 +63,7 @@ def main(): external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') properties = { + "sequence_label": seqlabel, "upload_app": "bh20-seq-uploader", "upload_ip": external_ip, "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()) diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl index 2b780d4..01196f6 100644 --- a/workflows/pangenome-generate/relabel-seqs.cwl +++ b/workflows/pangenome-generate/relabel-seqs.cwl @@ -3,6 +3,10 @@ class: CommandLineTool inputs: readsFA: File[] subjects: string[] + script: + type: File + default: {class: File, location: relabel-seqs.py} + inputBinding: {} outputs: relabeledSeqs: type: File @@ -15,11 +19,30 @@ outputs: requirements: InlineJavascriptRequirement: {} InitialWorkDirRequirement: - listing: - - entry: {$include: relabel-seqs.py} - entryname: relabel-seqs.py + listing: | + ${ + var i = 0; + var b = 1; + var out = []; + for (; i < inputs.readsFA.length; i++) { + var block = []; + for (; i < (b*100) && i < inputs.readsFA.length; i++) { + block.push(inputs.readsFA[i]); + } + out.push({ + entryname: "block"+b, + entry: JSON.stringify(block) + }); + b++; + } + out.push({ + entry: JSON.stringify(inputs.subjects), + entryname: "subjects" + }); + return out; + } hints: DockerRequirement: dockerPull: commonworkflowlanguage/cwltool_module stdout: -baseCommand: [python, relabel-seqs.py] +baseCommand: [python] diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py index 1188ceb..970540f 100644 --- a/workflows/pangenome-generate/relabel-seqs.py +++ b/workflows/pangenome-generate/relabel-seqs.py @@ -1,5 +1,15 @@ -reads = $(inputs.readsFA) -subjects = $(inputs.subjects) +import os +import json + +reads = [] +b = 1 +while os.path.exists("block%i" % b): + with open("block%i" % b) as f: + reads.extend(json.load(f)) + b += 1 + +with open("subjects") as f: + subjects = json.load(f) relabeled_fasta = open("relabeledSeqs.fasta", "wt") original_labels = open("originalLabels.ttl", "wt") @@ -7,12 +17,12 @@ original_labels = open("originalLabels.ttl", "wt") for i, r in enumerate(reads): with open(r["path"], "rt") as fa: label = fa.readline() - original_labels.write("<%s> \\"%s\\" .\\n" % (subjects[i], label[1:].strip().replace('"', '\\\\"'))) - relabeled_fasta.write(">"+subjects[i]+"\\n") + original_labels.write("<%s> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"'))) + relabeled_fasta.write(">"+subjects[i]+"\n") data = fa.read(8096) while data: relabeled_fasta.write(data) - endswithnewline = data.endswith("\\n") + endswithnewline = data.endswith("\n") data = fa.read(8096) if not endswithnewline: - relabeled_fasta.write("\\n") + relabeled_fasta.write("\n") -- cgit v1.2.3