From 0b0fb1c8a68df989bb2e1f593d717ac62e31d952 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 19 Oct 2020 21:04:19 -0400
Subject: Extract subset of the all-sequences fasta by running a sparql query.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 workflows/pangenome-generate/from_sparql.cwl | 23 +++++++++++++++++++++++
 workflows/pangenome-generate/from_sparql.py  |  8 ++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 workflows/pangenome-generate/from_sparql.cwl
 create mode 100644 workflows/pangenome-generate/from_sparql.py

(limited to 'workflows')

diff --git a/workflows/pangenome-generate/from_sparql.cwl b/workflows/pangenome-generate/from_sparql.cwl
new file mode 100644
index 0000000..5bc0792
--- /dev/null
+++ b/workflows/pangenome-generate/from_sparql.cwl
@@ -0,0 +1,23 @@
+cwlVersion: v1.1
+class: CommandLineTool
+requirements:
+  DockerRequirement:
+    dockerFile: |
+      FROM debian:10
+      RUN apt-get update && apt-get -yq --no-install-recommends install samtools python3-rdflib
+    dockerImageId: rdflib-and-samtools
+inputs:
+  script:
+    type: File
+    default:
+      class: File
+      location: from_sparql.py
+  metadata: File
+  fasta:
+    type: File
+    secondaryFiles: [.fai]
+  query: string
+stdout: selected.fasta
+outputs:
+  selected: stdout
+arguments: [python3, $(inputs.script), $(inputs.metadata), $(inputs.fasta), $(inputs.query)]
diff --git a/workflows/pangenome-generate/from_sparql.py b/workflows/pangenome-generate/from_sparql.py
new file mode 100644
index 0000000..4610cad
--- /dev/null
+++ b/workflows/pangenome-generate/from_sparql.py
@@ -0,0 +1,8 @@
+from rdflib import Graph
+import sys
+import subprocess
+g = Graph()
+g.parse(sys.argv[1], format="nt")
+res = g.query(sys.argv[3])
+for r in res:
+    subprocess.run(["samtools", "faidx", sys.argv[2], r[0]])
--
cgit v1.2.3
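The calling convention, per the `arguments` line above, is `python3 from_sparql.py <metadata> <fasta> <query>`: an N-triples metadata file, a FASTA that already has a `.fai` index alongside it, and a SPARQL query whose first result column must be a sequence name present in that index, because each result value is handed straight to `samtools faidx` and the matching records are concatenated on stdout into `selected.fasta`. A minimal local sketch of that contract, using a made-up namespace and predicate rather than the project's actual metadata vocabulary:

```python
# Toy check of the from_sparql.py contract. The namespace and predicate below
# are placeholders, not the real PubSeq metadata schema.
from rdflib import Graph, Literal, URIRef

EX = "http://example.org/pubseq/"
g = Graph()
g.add((URIRef(EX + "seq1"),
       URIRef(EX + "original_label"),   # stand-in predicate
       Literal("MT326090.1")))          # must match a sequence name in the .fai index
g.serialize("toy-metadata.nt", format="nt")

query = """
SELECT ?label WHERE {
  ?seq <http://example.org/pubseq/original_label> ?label .
}
"""

g2 = Graph()
g2.parse("toy-metadata.nt", format="nt")
for row in g2.query(query):
    # from_sparql.py runs: samtools faidx <fasta> <row[0]>
    # rdflib terms are str subclasses, so subprocess accepts them directly.
    print(row[0])
```

Rows whose first binding is not a valid sequence name simply make that `samtools faidx` call fail; the script does not check return codes, so such rows contribute nothing to the output.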
From 0d1831449d7541e0ec36fd75793915d144a02b5d Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 9 Nov 2020 16:06:01 -0500
Subject: Rename schema param to metadataSchema

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 workflows/pangenome-generate/arv-main.cwl     | 2 +-
 workflows/pangenome-generate/collect-seqs.cwl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'workflows')

diff --git a/workflows/pangenome-generate/arv-main.cwl b/workflows/pangenome-generate/arv-main.cwl
index dae47e6..1d71ee3 100644
--- a/workflows/pangenome-generate/arv-main.cwl
+++ b/workflows/pangenome-generate/arv-main.cwl
@@ -36,7 +36,7 @@ steps:
     run: collect-seqs.cwl
     in:
       src_project: src_project
-      schema: metadataSchema
+      metadataSchema: metadataSchema
       exclude: exclude
     out: [relabeledSeqs, mergedMetadata]
   pangenome-generate:
diff --git a/workflows/pangenome-generate/collect-seqs.cwl b/workflows/pangenome-generate/collect-seqs.cwl
index 3511df1..635108f 100644
--- a/workflows/pangenome-generate/collect-seqs.cwl
+++ b/workflows/pangenome-generate/collect-seqs.cwl
@@ -25,7 +25,7 @@ inputs:
   src_project:
     type: string
     inputBinding: {position: 2}
-  schema:
+  metadataSchema:
     type: File
     inputBinding: {position: 3}
   exclude:
--
cgit v1.2.3


From b5143c79de268b844f3a6a63d92c6389b047f35e Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 9 Nov 2020 16:55:33 -0500
Subject: Make it so "pangenome analysis" only runs collect-seqs.

Will ensure that metadata is kept up to date.

GFA isn't being generated. Will introduce new workflow that uses
from_sparql to analyze a subset.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 bh20seqanalyzer/main.py                       | 2 +-
 bh20simplewebuploader/templates/resource.html | 4 ++--
 workflows/update-workflows.sh                 | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'workflows')

diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 5f00080..0906958 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -215,7 +215,7 @@ class SeqAnalyzer:
         most_recent_analysis = self.api.groups().list(filters=[['owner_uuid', '=', self.pangenome_analysis_project]],
                                                       order="created_at desc").execute()
         for m in most_recent_analysis["items"]:
-            wf = self.get_workflow_output_from_project(m["uuid"], "arv-main.cwl")
+            wf = self.get_workflow_output_from_project(m["uuid"], "collect-seqs.cwl")
             if wf is None:
                 continue
             src = self.api.collections().get(uuid=wf["output_uuid"]).execute()
diff --git a/bh20simplewebuploader/templates/resource.html b/bh20simplewebuploader/templates/resource.html
index fc52f13..4c50fb9 100644
--- a/bh20simplewebuploader/templates/resource.html
+++ b/bh20simplewebuploader/templates/resource.html
@@ -10,8 +10,8 @@

 [The HTML markup in this hunk was lost when the patch was rendered. The context
 lines are the links "All sequences project", "All sequences (FASTA) relabled
 and deduplicated", and "Metadata (RDF) for all sequences". The two removed
 lines are the links "All sequences in Graphical Fragment Assembly (GFA) -
 More about GFA" and "All sequences in Optimized Dynamic Genome/Graph
 Implementation (ODGI) - More about ODGI". The two lines added in their place
 are not recoverable from this render.]
diff --git a/workflows/update-workflows.sh b/workflows/update-workflows.sh
index 3b69a58..5182ec4 100755
--- a/workflows/update-workflows.sh
+++ b/workflows/update-workflows.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-2zp9q4jo5xpif9y fastq2fasta/fastq2fasta.cwl
-arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-mqfu9y3ofnpnho1 pangenome-generate/arv-main.cwl
+arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-mqfu9y3ofnpnho1 pangenome-generate/collect-seqs.cwl
--
cgit v1.2.3


From 1082b907d816f5da52aba6233073737632d0242f Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Mon, 9 Nov 2020 17:20:27 -0500
Subject: Make resource link work for both portable data hashes and sample id

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 bh20simplewebuploader/main.py                | 14 ++++++++++++++
 workflows/pangenome-generate/collect-seqs.py |  2 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'workflows')

diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 73503b4..405544c 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -675,13 +675,27 @@ sparqlURL='http://sparql.genenetwork.org/sparql/'
 @app.route('/resource/<id>')
 def resource(id):
     """Get a COVID19 resource using identifier"""
+
     query=f"""
 PREFIX pubseq: <...>
 PREFIX sio: <...>
 select distinct ?sample ?geoname ?date ?source ?geo ?sampletype ?institute ?sequenceuri
+where {{
 {{
     ?sample sio:SIO_000115 "{id}" .
     ?sequenceuri pubseq:sample ?sample .
+}}
+union
+{{
+    <...> pubseq:sample ?sample .
+    ?sequenceuri pubseq:sample ?sample .
+}}
+union
+{{
+    <...> pubseq:sample ?sample .
+    ?sequenceuri pubseq:sample ?sample .
+}}
+
     ?sample <...> ?geo .
     ?geo rdfs:label ?geoname .
     ?sample <...> ?date .
diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py
index af4a0dc..1a0807c 100644
--- a/workflows/pangenome-generate/collect-seqs.py
+++ b/workflows/pangenome-generate/collect-seqs.py
@@ -38,7 +38,7 @@ for item in validated:
     pdh = item["portable_data_hash"]
     with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
         with col.open("sequence.fasta", "rt") as fa:
-            subject = "http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % pdh
+            subject = "http://covid19.genenetwork.org/resource/%s" % pdh
             label = fa.readline().strip()
             merged_metadata.write("<%s> <...> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
             skip = (subject in blacklist or label[1:] in blacklist)
--
cgit v1.2.3
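The angle-bracketed IRIs in this hunk did not survive the rendering, but the commit message states the intent: `/resource/<id>` should resolve whether the caller passes a sample identifier (matched through `sio:SIO_000115`) or a portable data hash embedded in a resource IRI, by trying each pattern in a `union`. A rough sketch of that kind of union lookup against the public endpoint; the prefix IRIs and the exact subject patterns below are assumptions, not read from this patch:

```python
# Illustrative only: the pubseq/sio prefix IRIs and the resource-IRI pattern are
# placeholders; the real query text lives in bh20simplewebuploader/main.py.
import requests

SPARQL_URL = "http://sparql.genenetwork.org/sparql/"

def lookup_sequences(identifier):
    query = f"""
PREFIX pubseq: <http://example.org/pubseq#>
PREFIX sio: <http://semanticscience.org/resource/>
SELECT DISTINCT ?sample ?sequenceuri WHERE {{
  {{ ?sample sio:SIO_000115 "{identifier}" . }}
  UNION
  {{ <http://covid19.genenetwork.org/resource/{identifier}> pubseq:sample ?sample . }}
  ?sequenceuri pubseq:sample ?sample .
}}
"""
    r = requests.get(SPARQL_URL,
                     params={"query": query},
                     headers={"Accept": "application/sparql-results+json"},
                     timeout=30)
    r.raise_for_status()
    return [b["sequenceuri"]["value"] for b in r.json()["results"]["bindings"]]

# e.g. lookup_sequences("MT326090.1") or lookup_sequences("<a portable data hash>")
```

Whichever branch matches, the trailing `?sequenceuri pubseq:sample ?sample` pattern then pulls every sequence attached to that sample.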
From c01188ec20936462357b317f81567aadc64c8f33 Mon Sep 17 00:00:00 2001
From: Peter Amstutz
Date: Tue, 10 Nov 2020 11:52:37 -0500
Subject: Use arvados uuids for RDF subjects.

Arvados-DCO-1.1-Signed-off-by: Peter Amstutz
---
 bh20simplewebuploader/main.py                | 5 +++++
 workflows/pangenome-generate/collect-seqs.py | 5 ++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'workflows')

diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 51048a4..bcdf8d8 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -695,6 +695,11 @@ union
     <...> pubseq:sample ?sample .
     ?sequenceuri pubseq:sample ?sample .
 }}
+union
+{{
+    ?sequenceuri <...> "{id}" .
+    ?sequenceuri pubseq:sample ?sample .
+}}
 
     ?sample <...> ?geo .
     ?geo rdfs:label ?geoname .
diff --git a/workflows/pangenome-generate/collect-seqs.py b/workflows/pangenome-generate/collect-seqs.py
index 1a0807c..225a61f 100644
--- a/workflows/pangenome-generate/collect-seqs.py
+++ b/workflows/pangenome-generate/collect-seqs.py
@@ -36,11 +36,14 @@ if len(sys.argv) > 3:
 
 for item in validated:
     pdh = item["portable_data_hash"]
+    uuid = item["uuid"]
     with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
         with col.open("sequence.fasta", "rt") as fa:
-            subject = "http://covid19.genenetwork.org/resource/%s" % pdh
+            subject = "http://covid19.genenetwork.org/resource/%s" % uuid
             label = fa.readline().strip()
             merged_metadata.write("<%s> <...> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
+            merged_metadata.write("<%s> <...> \"%s\" .\n" % (subject, pdh))
+            merged_metadata.write("<%s> <...> \"%s\" .\n" % (subject, item["version"]))
             skip = (subject in blacklist or label[1:] in blacklist)
             if skip:
                 merged_metadata.write("<%s> <...> \"true\"^^<...> .\n" % subject)
--
cgit v1.2.3
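With this change each validated upload is described by a subject of the form `http://covid19.genenetwork.org/resource/<collection uuid>`, with the portable data hash and collection version attached as additional triples, presumably so the subject stays stable while the content-addressed hash and version remain queryable. The predicate IRIs are not visible in this render, so the sketch below of the emitted N-triples uses placeholder predicates; only the subject/object layout follows the code above.

```python
# Shape of the per-collection triples collect-seqs.py now writes.
# All predicates are placeholders (placeholder:*) and the item values are
# invented; the real vocabulary is not visible in this patch.
item = {
    "uuid": "lugli-4zz18-0123456789abcde",
    "portable_data_hash": "fa7aeb5140e2848d39b416daeef4ffc5+45",
    "version": 3,
}
fasta_header = ">MT326090.1 Severe acute respiratory syndrome coronavirus 2"

subject = "http://covid19.genenetwork.org/resource/%s" % item["uuid"]
label = fasta_header[1:].replace('"', '\\"')   # same normalisation as the script

triples = [
    '<%s> <placeholder:original_fasta_label> "%s" .' % (subject, label),
    '<%s> <placeholder:portable_data_hash> "%s" .' % (subject, item["portable_data_hash"]),
    '<%s> <placeholder:collection_version> "%s" .' % (subject, item["version"]),
]
print("\n".join(triples))
```

Downstream selections, such as the `from_sparql` extraction introduced in the first patch, can then match on any of these properties and still arrive at a single stable subject per upload.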