aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Amstutz2020-08-19 15:19:07 -0400
committerPeter Amstutz2020-08-19 16:31:38 -0400
commitb3d2ccf951903ac0b7d717357fb1cccca26fbd15 (patch)
tree92adcacbd208546bf3f063e6c25765e660972244
parent592c921a3223c03d8a22f7a852641ac5d753fb31 (diff)
downloadbh20-seq-resource-b3d2ccf951903ac0b7d717357fb1cccca26fbd15.tar.gz
bh20-seq-resource-b3d2ccf951903ac0b7d717357fb1cccca26fbd15.tar.lz
bh20-seq-resource-b3d2ccf951903ac0b7d717357fb1cccca26fbd15.zip
Consolidate steps to scale graph generation workflow
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
-rw-r--r--bh20seqanalyzer/main.py21
-rw-r--r--workflows/pangenome-generate/arv-main.cwl48
-rw-r--r--workflows/pangenome-generate/collect-seqs.cwl42
-rw-r--r--workflows/pangenome-generate/collect-seqs.py67
-rw-r--r--workflows/pangenome-generate/dups2metadata.cwl19
-rw-r--r--workflows/pangenome-generate/dups2metadata.py17
-rw-r--r--workflows/pangenome-generate/pangenome-generate_spoa.cwl25
-rw-r--r--workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl4
-rw-r--r--workflows/pangenome-generate/spoa.cwl4
-rwxr-xr-xworkflows/update-workflows.sh2
10 files changed, 209 insertions, 40 deletions
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index ad3c3d7..13efbeb 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -181,13 +181,7 @@ class SeqAnalyzer:
if self.schema_ref is None:
self.upload_schema()
- validated = arvados.util.list_all(self.api.collections().list, filters=[
- ["owner_uuid", "=", self.validated_project],
- ["properties.status", "=", "validated"]])
inputobj = {
- "inputReads": [],
- "metadata": [],
- "subjects": [],
"metadataSchema": {
"class": "File",
"location": self.schema_ref
@@ -195,19 +189,10 @@ class SeqAnalyzer:
"exclude": {
"class": "File",
"location": self.exclude_list
- }
+ },
+ "src_project": self.validated_project
}
- validated.sort(key=lambda v: v["portable_data_hash"])
- for v in validated:
- inputobj["inputReads"].append({
- "class": "File",
- "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
- })
- inputobj["metadata"].append({
- "class": "File",
- "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
- })
- inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
+
self.run_workflow(self.pangenome_analysis_project, self.pangenome_workflow_uuid, "Pangenome analysis", inputobj)
diff --git a/workflows/pangenome-generate/arv-main.cwl b/workflows/pangenome-generate/arv-main.cwl
new file mode 100644
index 0000000..176cfe7
--- /dev/null
+++ b/workflows/pangenome-generate/arv-main.cwl
@@ -0,0 +1,47 @@
+cwlVersion: v1.1
+class: Workflow
+requirements:
+  SubworkflowFeatureRequirement: {}
+inputs:
+  src_project: string
+  metadataSchema: File
+  exclude: File?
+outputs:
+  odgiGraph:
+    type: File
+    outputSource: pangenome-generate/odgiGraph
+  odgiPNG:
+    type: File
+    outputSource: pangenome-generate/odgiPNG
+  spoaGFA:
+    type: File
+    outputSource: pangenome-generate/spoaGFA
+  odgiRDF:
+    type: File
+    outputSource: pangenome-generate/odgiRDF
+  readsMergeDedup:
+    type: File
+    outputSource: pangenome-generate/readsMergeDedup
+  mergedMetadata:
+    type: File
+    outputSource: pangenome-generate/mergedMetadata
+  indexed_paths:
+    type: File
+    outputSource: pangenome-generate/indexed_paths
+  colinear_components:
+    type: Directory
+    outputSource: pangenome-generate/colinear_components
+steps:
+  collect-seqs:
+    run: collect-seqs.cwl
+    in:
+      src_project: src_project
+      schema: metadataSchema
+      exclude: exclude
+    out: [relabeledSeqs, mergedMetadata]
+  pangenome-generate:
+    run: pangenome-generate_spoa.cwl
+    in:
+      seqs: collect-seqs/relabeledSeqs
+      metadata: collect-seqs/mergedMetadata
+    out: [odgiGraph, odgiPNG, spoaGFA, odgiRDF, readsMergeDedup, mergedMetadata, indexed_paths, colinear_components]
diff --git a/workflows/pangenome-generate/collect-seqs.cwl b/workflows/pangenome-generate/collect-seqs.cwl
new file mode 100644
index 0000000..3511df1
--- /dev/null
+++ b/workflows/pangenome-generate/collect-seqs.cwl
@@ -0,0 +1,42 @@
+cwlVersion: v1.1
+class: CommandLineTool
+$namespaces:
+ arv: "http://arvados.org/cwl#"
+ cwltool: "http://commonwl.org/cwltool#"
+requirements:
+ arv:APIRequirement: {}
+ arv:RuntimeConstraints:
+ outputDirType: keep_output_dir
+ DockerRequirement:
+ dockerPull: arvados/jobs:2.0.3
+ WorkReuse:
+ enableReuse: false
+ ResourceRequirement:
+ coresMin: 1
+ ramMin: 1024
+baseCommand: python3
+inputs:
+ script:
+ type: File
+ default:
+ class: File
+ location: collect-seqs.py
+ inputBinding: {position: 1}
+ src_project:
+ type: string
+ inputBinding: {position: 2}
+ schema:
+ type: File
+ inputBinding: {position: 3}
+ exclude:
+ type: File?
+ inputBinding: {position: 4}
+outputs:
+ relabeledSeqs:
+ type: File
+ outputBinding:
+ glob: relabeledSeqs.fasta
+ mergedMetadata:
+ type: File
+ outputBinding:
+ glob: mergedMetadata.ttl
diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py
new file mode 100644
index 0000000..af4a0dc
--- /dev/null
+++ b/workflows/pangenome-generate/collect-seqs.py
@@ -0,0 +1,67 @@
+import sys
+import shutil
+import arvados
+import arvados.collection
+import ruamel.yaml
+import schema_salad.schema
+import schema_salad.jsonld_context
+from schema_salad.sourceline import add_lc_filename
+
+api = arvados.api()
+keepclient = arvados.keep.KeepClient(api_client=api)
+
+validated = arvados.util.list_all(api.collections().list, filters=[
+    ["owner_uuid", "=", sys.argv[1]],
+    ["properties.status", "=", "validated"]])
+
+validated.sort(key=lambda v: v["portable_data_hash"])
+
+relabeled_fasta = open("relabeledSeqs.fasta", "wt")
+merged_metadata = open("mergedMetadata.ttl", "wt")
+
+metadataSchema = sys.argv[2]
+
+blacklist = set()
+if len(sys.argv) > 3:
+    with open(sys.argv[3]) as bl:
+        for l in bl:
+            blacklist.add(l.strip())
+
+(document_loader,
+ avsc_names,
+ schema_metadata,
+ metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
+
+
+for item in validated:
+    pdh = item["portable_data_hash"]
+    with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
+        with col.open("sequence.fasta", "rt") as fa:
+            subject = "http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % pdh
+            label = fa.readline().strip()
+            merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
+            skip = (subject in blacklist or label[1:] in blacklist)
+            if skip:
+                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % subject)
+            if not skip:
+                relabeled_fasta.write(">"+subject+"\n")
+            endswithnewline = True  # avoids NameError below when the record has no sequence body
+            data = fa.read(8096)
+            while data:
+                if not skip:
+                    relabeled_fasta.write(data)
+                endswithnewline = data.endswith("\n")
+                data = fa.read(8096)
+            if not skip and not endswithnewline:
+                relabeled_fasta.write("\n")
+
+        with col.open("metadata.yaml", "rt") as md:
+            metadata_content = ruamel.yaml.round_trip_load(md)
+            metadata_content["id"] = subject
+            add_lc_filename(metadata_content, metadata_content["id"])
+            doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
+            g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
+            merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
+
+
+shutil.rmtree(".cache", ignore_errors=True)
diff --git a/workflows/pangenome-generate/dups2metadata.cwl b/workflows/pangenome-generate/dups2metadata.cwl
new file mode 100644
index 0000000..cf54675
--- /dev/null
+++ b/workflows/pangenome-generate/dups2metadata.cwl
@@ -0,0 +1,19 @@
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: dups2metadata.py
+    inputBinding: {position: 1}
+  metadata:
+    type: File
+    inputBinding: {position: 2}
+  dups:
+    type: File?
+    inputBinding: {position: 3}
+stdout: mergedmetadata.ttl
+outputs:
+  merged: stdout
diff --git a/workflows/pangenome-generate/dups2metadata.py b/workflows/pangenome-generate/dups2metadata.py
new file mode 100644
index 0000000..9bda10a
--- /dev/null
+++ b/workflows/pangenome-generate/dups2metadata.py
@@ -0,0 +1,19 @@
+import logging
+import re
+import sys
+
+md = open(sys.argv[1], "rt")
+for d in md:
+    print(d)
+
+if len(sys.argv) < 3:
+    exit(0)
+
+sameseqs = open(sys.argv[2], "rt")
+for d in sameseqs:
+    logging.warning(d)
+    g = re.match(r"\d+\t(.*)", d)
+    logging.warning("%s", g.group(1))
+    sp = g.group(1).split(",")
+    for n in sp[1:]:
+        print("<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> ." % (n.strip(), sp[0].strip()))
diff --git a/workflows/pangenome-generate/pangenome-generate_spoa.cwl b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
index 8b34ff8..33bf64e 100644
--- a/workflows/pangenome-generate/pangenome-generate_spoa.cwl
+++ b/workflows/pangenome-generate/pangenome-generate_spoa.cwl
@@ -5,11 +5,8 @@ requirements:
ScatterFeatureRequirement: {}
StepInputExpressionRequirement: {}
inputs:
- inputReads: File[]
- metadata: File[]
- metadataSchema: File
- subjects: string[]
- exclude: File?
+ seqs: File
+ metadata: File
bin_widths:
type: int[]
default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
@@ -36,7 +33,7 @@ outputs:
outputSource: dedup_and_sort_by_quality_and_len/reads_dedupped_sorted_by_quality_and_len
mergedMetadata:
type: File
- outputSource: mergeMetadata/merged
+ outputSource: dups2metadata/merged
indexed_paths:
type: File
outputSource: index_paths/indexed_paths
@@ -44,15 +41,8 @@ outputs:
type: Directory
outputSource: segment_components/colinear_components
steps:
- relabel:
- in:
- readsFA: inputReads
- subjects: subjects
- exclude: exclude
- out: [relabeledSeqs, originalLabels]
- run: relabel-seqs.cwl
dedup_and_sort_by_quality_and_len:
- in: {reads: relabel/relabeledSeqs}
+ in: {reads: seqs}
out: [reads_dedupped_sorted_by_quality_and_len, dups]
run: sort_fasta_by_quality_and_len.cwl
induceGraph:
@@ -81,15 +71,12 @@ steps:
in: {odgi: buildGraph/odgiGraph}
out: [rdf]
run: odgi_to_rdf.cwl
- mergeMetadata:
+ dups2metadata:
in:
metadata: metadata
- metadataSchema: metadataSchema
- subjects: subjects
dups: dedup_and_sort_by_quality_and_len/dups
- originalLabels: relabel/originalLabels
out: [merged]
- run: merge-metadata.cwl
+ run: dups2metadata.cwl
bin_paths:
run: ../tools/odgi/odgi_bin.cwl
in:
diff --git a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
index 59f027e..f8da5d3 100644
--- a/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
+++ b/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl
@@ -1,5 +1,9 @@
cwlVersion: v1.1
class: CommandLineTool
+hints:
+ ResourceRequirement:
+ coresMin: 1
+ ramMin: 1024
inputs:
readsFA:
type: File
diff --git a/workflows/pangenome-generate/spoa.cwl b/workflows/pangenome-generate/spoa.cwl
index 1e390d8..132633c 100644
--- a/workflows/pangenome-generate/spoa.cwl
+++ b/workflows/pangenome-generate/spoa.cwl
@@ -2,8 +2,7 @@ cwlVersion: v1.1
class: CommandLineTool
inputs:
readsFA: File
-stdout: $(inputs.readsFA.nameroot).g6.gfa
-script:
+ script:
type: File
default: {class: File, location: relabel-seqs.py}
outputs:
@@ -20,6 +19,7 @@ hints:
ramMin: $(15 * 1024)
outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
baseCommand: spoa
+stdout: $(inputs.readsFA.nameroot).g6.gfa
arguments: [
$(inputs.readsFA),
-G,
diff --git a/workflows/update-workflows.sh b/workflows/update-workflows.sh
index ea9e199..3b69a58 100755
--- a/workflows/update-workflows.sh
+++ b/workflows/update-workflows.sh
@@ -1,3 +1,3 @@
#!/bin/sh
arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-2zp9q4jo5xpif9y fastq2fasta/fastq2fasta.cwl
-arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-mqfu9y3ofnpnho1 pangenome-generate/pangenome-generate.cwl
+arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-mqfu9y3ofnpnho1 pangenome-generate/arv-main.cwl