diff options
Diffstat (limited to 'workflows/pangenome-generate')
-rw-r--r-- | workflows/pangenome-generate/merge-metadata.cwl | 44 | ||||
-rw-r--r-- | workflows/pangenome-generate/merge-metadata.py | 40 | ||||
-rw-r--r-- | workflows/pangenome-generate/minimap2.cwl | 2 | ||||
-rw-r--r-- | workflows/pangenome-generate/pangenome-generate.cwl | 15 | ||||
-rw-r--r-- | workflows/pangenome-generate/relabel-seqs.cwl | 50 | ||||
-rw-r--r-- | workflows/pangenome-generate/relabel-seqs.py | 30 | ||||
-rw-r--r-- | workflows/pangenome-generate/seqkit-rmdup.cwl | 7 | ||||
-rw-r--r-- | workflows/pangenome-generate/testjob.yml | 16 |
8 files changed, 191 insertions, 13 deletions
diff --git a/workflows/pangenome-generate/merge-metadata.cwl b/workflows/pangenome-generate/merge-metadata.cwl index 9164c09..4d9c808 100644 --- a/workflows/pangenome-generate/merge-metadata.cwl +++ b/workflows/pangenome-generate/merge-metadata.cwl @@ -5,14 +5,48 @@ hints: dockerPull: commonworkflowlanguage/cwltool_module inputs: metadata: File[] - metadataSchema: File subjects: string[] + metadataSchema: + type: File + inputBinding: {position: 2} + originalLabels: + type: File + inputBinding: {position: 3} + dups: + type: File? + inputBinding: {position: 4} + script: + type: File + inputBinding: {position: 1} + default: {class: File, location: merge-metadata.py} outputs: merged: stdout stdout: mergedmetadata.ttl requirements: + InlineJavascriptRequirement: {} InitialWorkDirRequirement: - listing: - - entry: {$include: merge-metadata.py} - entryname: merge-metadata.py -baseCommand: [python3, merge-metadata.py] + listing: | + ${ + var i = 0; + var b = 1; + var out = []; + for (; i < inputs.metadata.length; i++) { + var block = []; + var sub = []; + for (; i < (b*150) && i < inputs.metadata.length; i++) { + block.push(inputs.metadata[i]); + sub.push(inputs.subjects[i]); + } + out.push({ + entryname: "block"+b, + entry: JSON.stringify(block) + }); + out.push({ + entryname: "subs"+b, + entry: JSON.stringify(sub) + }); + b++; + } + return out; + } +baseCommand: python diff --git a/workflows/pangenome-generate/merge-metadata.py b/workflows/pangenome-generate/merge-metadata.py index 64275b1..65d08a6 100644 --- a/workflows/pangenome-generate/merge-metadata.py +++ b/workflows/pangenome-generate/merge-metadata.py @@ -1,9 +1,28 @@ +import re import schema_salad.schema import schema_salad.jsonld_context +import json +import sys +import os +import logging -metadataSchema = '$(inputs.metadataSchema.path)' -metadata = $(inputs.metadata) -subjects = $(inputs.subjects) +metadataSchema = sys.argv[1] +originalLabels = sys.argv[2] +dups = None +if len(sys.argv) == 4: + dups = sys.argv[3] + +def readitems(stem): + items = [] + b = 1 + while os.path.exists("%s%i" % (stem, b)): + with open("%s%i" % (stem, b)) as f: + items.extend(json.load(f)) + b += 1 + return items + +metadata = readitems("block") +subjects = readitems("subs") (document_loader, avsc_names, @@ -11,7 +30,20 @@ subjects = $(inputs.subjects) metaschema_loader) = schema_salad.schema.load_schema(metadataSchema) for i, m in enumerate(metadata): - doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], True) + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], False, False) doc["id"] = subjects[i] g = schema_salad.jsonld_context.makerdf(subjects[i], doc, document_loader.ctx) print(g.serialize(format="ntriples").decode("utf-8")) + +if dups: + sameseqs = open(dups, "rt") + for d in sameseqs: + logging.warn(d) + g = re.match(r"\d+\t(.*)", d) + logging.warn("%s", g.group(1)) + sp = g.group(1).split(",") + for n in sp[1:]: + print("<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> ." % (n.strip(), sp[0].strip())) + +orig = open(originalLabels, "rt") +print(orig.read()) diff --git a/workflows/pangenome-generate/minimap2.cwl b/workflows/pangenome-generate/minimap2.cwl index bf19ef7..42d1dce 100644 --- a/workflows/pangenome-generate/minimap2.cwl +++ b/workflows/pangenome-generate/minimap2.cwl @@ -12,7 +12,7 @@ hints: ResourceRequirement: coresMin: 8 coresMax: 32 - ramMin: $(7 * 1024) + ramMin: $(9 * 1024) outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) stdout: $(inputs.readsFA.nameroot).paf baseCommand: minimap2 diff --git a/workflows/pangenome-generate/pangenome-generate.cwl b/workflows/pangenome-generate/pangenome-generate.cwl index 2710743..6794e2d 100644 --- a/workflows/pangenome-generate/pangenome-generate.cwl +++ b/workflows/pangenome-generate/pangenome-generate.cwl @@ -18,13 +18,22 @@ outputs: odgiRDF: type: File outputSource: odgi2rdf/rdf + readsMergeDedup: + type: File + outputSource: dedup/readsMergeDedup mergedMetadata: type: File outputSource: mergeMetadata/merged steps: + relabel: + in: + readsFA: inputReads + subjects: subjects + out: [relabeledSeqs, originalLabels] + run: relabel-seqs.cwl dedup: - in: {readsFA: inputReads} - out: [readsMergeDedup] + in: {readsFA: relabel/relabeledSeqs} + out: [readsMergeDedup, dups] run: seqkit-rmdup.cwl overlapReads: in: {readsFA: dedup/readsMergeDedup} @@ -53,5 +62,7 @@ steps: metadata: metadata metadataSchema: metadataSchema subjects: subjects + dups: dedup/dups + originalLabels: relabel/originalLabels out: [merged] run: merge-metadata.cwl diff --git a/workflows/pangenome-generate/relabel-seqs.cwl b/workflows/pangenome-generate/relabel-seqs.cwl new file mode 100644 index 0000000..c1f17a4 --- /dev/null +++ b/workflows/pangenome-generate/relabel-seqs.cwl @@ -0,0 +1,50 @@ +cwlVersion: v1.1 +class: CommandLineTool +inputs: + readsFA: File[] + subjects: string[] + script: + type: File + default: {class: File, location: relabel-seqs.py} + inputBinding: {} +outputs: + relabeledSeqs: + type: File + outputBinding: + glob: relabeledSeqs.fasta + originalLabels: + type: File + outputBinding: + glob: originalLabels.ttl +requirements: + InlineJavascriptRequirement: {} + InitialWorkDirRequirement: + listing: | + ${ + var i = 0; + var b = 1; + var out = []; + for (; i < inputs.readsFA.length; i++) { + var block = []; + var sub = []; + for (; i < (b*150) && i < inputs.readsFA.length; i++) { + block.push(inputs.readsFA[i]); + sub.push(inputs.subjects[i]); + } + out.push({ + entryname: "block"+b, + entry: JSON.stringify(block) + }); + out.push({ + entryname: "subs"+b, + entry: JSON.stringify(sub) + }); + b++; + } + return out; + } +hints: + DockerRequirement: + dockerPull: commonworkflowlanguage/cwltool_module +stdout: +baseCommand: [python] diff --git a/workflows/pangenome-generate/relabel-seqs.py b/workflows/pangenome-generate/relabel-seqs.py new file mode 100644 index 0000000..6b022a0 --- /dev/null +++ b/workflows/pangenome-generate/relabel-seqs.py @@ -0,0 +1,30 @@ +import os +import json + +def readitems(stem): + items = [] + b = 1 + while os.path.exists("%s%i" % (stem, b)): + with open("%s%i" % (stem, b)) as f: + items.extend(json.load(f)) + b += 1 + return items + +reads = readitems("block") +subjects = readitems("subs") + +relabeled_fasta = open("relabeledSeqs.fasta", "wt") +original_labels = open("originalLabels.ttl", "wt") + +for i, r in enumerate(reads): + with open(r["path"], "rt") as fa: + label = fa.readline() + original_labels.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subjects[i], label[1:].strip().replace('"', '\\"'))) + relabeled_fasta.write(">"+subjects[i]+"\n") + data = fa.read(8096) + while data: + relabeled_fasta.write(data) + endswithnewline = data.endswith("\n") + data = fa.read(8096) + if not endswithnewline: + relabeled_fasta.write("\n") diff --git a/workflows/pangenome-generate/seqkit-rmdup.cwl b/workflows/pangenome-generate/seqkit-rmdup.cwl index d3626f5..071fa66 100644 --- a/workflows/pangenome-generate/seqkit-rmdup.cwl +++ b/workflows/pangenome-generate/seqkit-rmdup.cwl @@ -1,12 +1,16 @@ cwlVersion: v1.1 class: CommandLineTool inputs: - readsFA: File[] + readsFA: File outputs: readsMergeDedup: type: File outputBinding: glob: readsMergeDedup.fasta + dups: + type: File? + outputBinding: + glob: dups.txt requirements: InlineJavascriptRequirement: {} hints: @@ -28,5 +32,6 @@ baseCommand: seqkit arguments: [rmdup, --by-seq, --ignore-case, + --dup-num-file, dups.txt, -o, readsMergeDedup.fasta, $(inputs.readsFA)] diff --git a/workflows/pangenome-generate/testjob.yml b/workflows/pangenome-generate/testjob.yml new file mode 100644 index 0000000..a48aff8 --- /dev/null +++ b/workflows/pangenome-generate/testjob.yml @@ -0,0 +1,16 @@ +inputReads: + - class: File + location: ../../example/sequence.fasta + - class: File + location: ../../example/sequence.fasta +metadata: + - class: File + location: ../../example/metadata.yaml + - class: File + location: ../../example/metadata.yaml +metadataSchema: + class: File + location: ../../bh20sequploader/bh20seq-schema.yml +subjects: + - http://arvados.org/keep/seq1 + - http://arvados.org/keep/seq2 |