From b5143c79de268b844f3a6a63d92c6389b047f35e Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Mon, 9 Nov 2020 16:55:33 -0500 Subject: Make it so "pangenome analysis" only runs collect-seqs. Will ensure that metadata is kept up to date. GFA isn't being generated. Will introduce new workflow that uses from_sparql to analyze a subset. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20seqanalyzer/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'bh20seqanalyzer') diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 5f00080..0906958 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -215,7 +215,7 @@ class SeqAnalyzer: most_recent_analysis = self.api.groups().list(filters=[['owner_uuid', '=', self.pangenome_analysis_project]], order="created_at desc").execute() for m in most_recent_analysis["items"]: - wf = self.get_workflow_output_from_project(m["uuid"], "arv-main.cwl") + wf = self.get_workflow_output_from_project(m["uuid"], "collect-seqs.cwl") if wf is None: continue src = self.api.collections().get(uuid=wf["output_uuid"]).execute() -- cgit v1.2.3 From c872248e43c1c66e5fed8ef341f7b4ac21d63e6f Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 10 Nov 2020 17:26:56 -0500 Subject: Valid uploads with sequence_label in PubSeq replaces old one. 
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- README.md | 4 +++- bh20seqanalyzer/main.py | 50 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 16 deletions(-) (limited to 'bh20seqanalyzer') diff --git a/README.md b/README.md index 3815bf4..9007350 100644 --- a/README.md +++ b/README.md @@ -146,9 +146,11 @@ For running/developing the uploader with GNU Guix see [INSTALL.md](./doc/INSTALL Run the uploader with a FASTA or FASTQ file and accompanying metadata file in JSON or YAML: ```sh -bh20-seq-uploader example/sequence.fasta example/metadata.yaml +bh20-seq-uploader example/metadata.yaml example/sequence.fasta ``` +If the sample_id of your upload matches a sample already in PubSeq, it will be considered a new version and supersede the existing entry. + ## Workflow for Generating a Pangenome All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 0906958..b54a746 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -42,11 +42,11 @@ class SeqAnalyzer: self.schema_ref = None def validate_upload(self, collection, revalidate): - col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient) - if not revalidate and collection["properties"].get("status") in ("validated", "rejected"): return False + col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient) + # validate the collection here. Check metadata, etc. 
logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"])) @@ -98,19 +98,7 @@ class SeqAnalyzer: except Exception as v: errors.append(str(v)) - - if not errors: - # Move it to the "validated" project to be included in the next analysis - if "errors" in collection["properties"]: - del collection["properties"]["errors"] - collection["properties"]["status"] = "validated" - self.api.collections().update(uuid=collection["uuid"], body={ - "owner_uuid": self.validated_project, - "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())), - "properties": collection["properties"]}).execute() - logging.info("Added '%s' to validated sequences" % collection["name"]) - return True - else: + if errors: # It is invalid logging.warn("'%s' (%s) has validation errors: %s" % ( collection["name"], collection["uuid"], "\n".join(errors))) @@ -119,6 +107,38 @@ class SeqAnalyzer: self.api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute() return False + existing = self.api.collections().list(filters=[["owner_uuid", "=", self.validated_project], + ["properties.sequence_label", "=", sample_id]]).execute() + + update_from = None + if existing["items"]: + # "collection" is the newly uploaded one we're looking at + update_from = collection + collection = existing["items"][0] + collection["properties"] = update_from["properties"] + + if "errors" in collection["properties"]: + del collection["properties"]["errors"] + collection["properties"]["status"] = "validated" + collection["properties"]["sequence_label"] = sample_id + + if update_from: + self.api.collections().update(uuid=collection["uuid"], body={ + "properties": collection["properties"], + "manifest_text": col.manifest_text() + }).execute() + self.api.collections().delete(uuid=update_from["uuid"]).execute() + logging.info("Updated '%s' in validated sequences" % collection["name"]) + else: + # Move it to the "validated" project to be included in the 
next analysis + self.api.collections().update(uuid=collection["uuid"], body={ + "owner_uuid": self.validated_project, + "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())), + "properties": collection["properties"]}).execute() + logging.info("Added '%s' to validated sequences" % collection["name"]) + + return True + def run_workflow(self, parent_project, workflow_uuid, name, inputobj): project = self.api.groups().create(body={ -- cgit v1.2.3