From b5143c79de268b844f3a6a63d92c6389b047f35e Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 9 Nov 2020 16:55:33 -0500 Subject: Make it so "pangenome analysis" only runs collect-seqs. Will ensure that metadata is kept up to date. GFA isn't being generated. Will introduce new workflow that uses from_sparql to analyze a subset. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20seqanalyzer/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'bh20seqanalyzer') diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 5f00080..0906958 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -215,7 +215,7 @@ class SeqAnalyzer: most_recent_analysis = self.api.groups().list(filters=[['owner_uuid', '=', self.pangenome_analysis_project]], order="created_at desc").execute() for m in most_recent_analysis["items"]: - wf = self.get_workflow_output_from_project(m["uuid"], "arv-main.cwl") + wf = self.get_workflow_output_from_project(m["uuid"], "collect-seqs.cwl") if wf is None: continue src = self.api.collections().get(uuid=wf["output_uuid"]).execute() -- cgit v1.2.3 From c872248e43c1c66e5fed8ef341f7b4ac21d63e6f Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 10 Nov 2020 17:26:56 -0500 Subject: Valid uploads with sequence_label in PubSeq replaces old one. 
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- README.md | 4 +++- bh20seqanalyzer/main.py | 50 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 16 deletions(-) (limited to 'bh20seqanalyzer') diff --git a/README.md b/README.md index 3815bf4..9007350 100644 --- a/README.md +++ b/README.md @@ -146,9 +146,11 @@ For running/developing the uploader with GNU Guix see [INSTALL.md](./doc/INSTALL Run the uploader with a FASTA or FASTQ file and accompanying metadata file in JSON or YAML: ```sh -bh20-seq-uploader example/sequence.fasta example/metadata.yaml +bh20-seq-uploader example/metadata.yaml example/sequence.fasta ``` +If the sample_id of your upload matches a sample already in PubSeq, it will be considered a new version and supersede the existing entry. + ## Workflow for Generating a Pangenome All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 0906958..b54a746 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -42,11 +42,11 @@ class SeqAnalyzer: self.schema_ref = None def validate_upload(self, collection, revalidate): - col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient) - if not revalidate and collection["properties"].get("status") in ("validated", "rejected"): return False + col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient) + # validate the collection here. Check metadata, etc. 
logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"])) @@ -98,19 +98,7 @@ class SeqAnalyzer: except Exception as v: errors.append(str(v)) - - if not errors: - # Move it to the "validated" project to be included in the next analysis - if "errors" in collection["properties"]: - del collection["properties"]["errors"] - collection["properties"]["status"] = "validated" - self.api.collections().update(uuid=collection["uuid"], body={ - "owner_uuid": self.validated_project, - "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())), - "properties": collection["properties"]}).execute() - logging.info("Added '%s' to validated sequences" % collection["name"]) - return True - else: + if errors: # It is invalid logging.warn("'%s' (%s) has validation errors: %s" % ( collection["name"], collection["uuid"], "\n".join(errors))) @@ -119,6 +107,38 @@ class SeqAnalyzer: self.api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute() return False + existing = self.api.collections().list(filters=[["owner_uuid", "=", self.validated_project], + ["properties.sequence_label", "=", sample_id]]).execute() + + update_from = None + if existing["items"]: + # "collection" is the newly uploaded one we're looking at + update_from = collection + collection = existing["items"][0] + collection["properties"] = update_from["properties"] + + if "errors" in collection["properties"]: + del collection["properties"]["errors"] + collection["properties"]["status"] = "validated" + collection["properties"]["sequence_label"] = sample_id + + if update_from: + self.api.collections().update(uuid=collection["uuid"], body={ + "properties": collection["properties"], + "manifest_text": col.manifest_text() + }).execute() + self.api.collections().delete(uuid=update_from["uuid"]).execute() + logging.info("Updated '%s' in validated sequences" % collection["name"]) + else: + # Move it to the "validated" project to be included in the 
next analysis + self.api.collections().update(uuid=collection["uuid"], body={ + "owner_uuid": self.validated_project, + "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())), + "properties": collection["properties"]}).execute() + logging.info("Added '%s' to validated sequences" % collection["name"]) + + return True + def run_workflow(self, parent_project, workflow_uuid, name, inputobj): project = self.api.groups().create(body={ -- cgit v1.2.3