about summary refs log tree commit diff
diff options
context:
space:
mode:
authorPeter Amstutz2020-07-16 16:41:33 -0400
committerGitHub2020-07-16 16:41:33 -0400
commiteb3bdab1109959deb5b11fd74310832bdaa50899 (patch)
treea79ace1119a78401c9383b0e107da4f9c1734ba1
parent8d995b271f20d15b2b6a7845ade22c396a383da3 (diff)
parentd49f6b5e11a41a51cb257bbafdcba410544f8486 (diff)
downloadbh20-seq-resource-eb3bdab1109959deb5b11fd74310832bdaa50899.tar.gz
bh20-seq-resource-eb3bdab1109959deb5b11fd74310832bdaa50899.tar.lz
bh20-seq-resource-eb3bdab1109959deb5b11fd74310832bdaa50899.zip
Merge pull request #98 from arvados/analysis-refactor
Analysis refactor
-rw-r--r--bh20seqanalyzer/main.py618
-rw-r--r--bh20sequploader/bh20seq-schema.yml4
-rw-r--r--bh20sequploader/bh20seq-shex.rdf4
-rw-r--r--bh20sequploader/main.py22
-rw-r--r--bh20simplewebuploader/main.py222
-rw-r--r--bh20simplewebuploader/static/main.css17
-rw-r--r--bh20simplewebuploader/templates/error.html2
-rw-r--r--bh20simplewebuploader/templates/footer.html5
-rw-r--r--bh20simplewebuploader/templates/status.html3
-rw-r--r--bh20simplewebuploader/templates/success.html2
-rw-r--r--bh20simplewebuploader/templates/validated.html17
-rw-r--r--scripts/cleanup.py38
12 files changed, 562 insertions, 392 deletions
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 0b52e6b..b3a439d 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -16,277 +16,308 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%
                     level=logging.INFO)
 logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN)
 
-def validate_upload(api, collection, validated_project,
-                    fastq_project, fastq_workflow_uuid,
-                    revalidate):
-    col = arvados.collection.Collection(collection["uuid"])
-
-    if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
-        return False
-
-    # validate the collection here.  Check metadata, etc.
-    logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))
-
-    errors = []
-
-    if collection["owner_uuid"] != validated_project:
-        dup = api.collections().list(filters=[["owner_uuid", "=", validated_project],
-                                              ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
-        if dup["items"]:
-            # This exact collection has been uploaded before.
-            errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
-
-    if not errors:
-        if "metadata.yaml" not in col:
-            errors.append("Missing metadata.yaml", collection["name"])
-        else:
+class SeqAnalyzer:
+
+    def __init__(self, api, keepclient,
+                 uploader_project,
+                 pangenome_analysis_project,
+                 fastq_project,
+                 validated_project,
+                 workflow_def_project,
+                 pangenome_workflow_uuid,
+                 fastq_workflow_uuid,
+                 exclude_list,
+                 latest_result_collection):
+        self.api = api
+        self.keepclient = keepclient
+        self.uploader_project = uploader_project
+        self.pangenome_analysis_project = pangenome_analysis_project
+        self.fastq_project = fastq_project
+        self.validated_project = validated_project
+        self.workflow_def_project = workflow_def_project
+        self.pangenome_workflow_uuid = pangenome_workflow_uuid
+        self.fastq_workflow_uuid = fastq_workflow_uuid
+        self.exclude_list = exclude_list
+        self.latest_result_uuid = latest_result_collection
+        self.schema_ref = None
+
+    def validate_upload(self, collection, revalidate):
+        col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient)
+
+        if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
+            return False
+
+        # validate the collection here.  Check metadata, etc.
+        logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))
+
+        errors = []
+
+        if collection["owner_uuid"] != self.validated_project:
+            dup = self.api.collections().list(filters=[["owner_uuid", "=", self.validated_project],
+                                                  ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
+            if dup["items"]:
+                # This exact collection has been uploaded before.
+                errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+
+        if not errors:
+            if "metadata.yaml" not in col:
+                errors.append("Missing metadata.yaml in %s" % collection["name"])
+            else:
+                try:
+                    with col.open("metadata.yaml") as md:
+                        metadata_content = ruamel.yaml.round_trip_load(md)
+                    metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
+                    sample_id = metadata_content["sample"]["sample_id"]
+                    add_lc_filename(metadata_content, metadata_content["id"])
+                    valid = qc_metadata(metadata_content)
+                    if not valid:
+                        errors.append("Failed metadata qc")
+                except Exception as e:
+                    errors.append(str(e))
+
+        if not errors:
             try:
-                metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml"))
-                metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
-                sample_id = metadata_content["sample"]["sample_id"]
-                add_lc_filename(metadata_content, metadata_content["id"])
-                valid = qc_metadata(metadata_content)
-                if not valid:
-                    errors.append("Failed metadata qc")
-            except Exception as e:
-                errors.append(str(e))
-
-    if not errors:
-        try:
-            tgt = None
-            paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
-            for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
-                if n not in col:
-                    continue
-                with col.open(n, 'rb') as qf:
-                    tgt = qc_fasta(qf)[0]
-                    if tgt != n and tgt != paired.get(n):
-                        errors.append("Expected %s but magic says it should be %s", n, tgt)
-                    elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
-                        start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid, n, sample_id)
-                        return False
-            if tgt is None:
-                errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"])
-        except Exception as v:
-            errors.append(str(v))
-
-
-    if not errors:
-        # Move it to the "validated" project to be included in the next analysis
-        if "errors" in collection["properties"]:
-            del collection["properties"]["errors"]
-        collection["properties"]["status"] = "validated"
-        api.collections().update(uuid=collection["uuid"], body={
-            "owner_uuid": validated_project,
-            "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
-            "properties": collection["properties"]}).execute()
-        logging.info("Added '%s' to validated sequences" % collection["name"])
-        return True
-    else:
-        # It is invalid
-        logging.warn("'%s' (%s) has validation errors: %s" % (
-            collection["name"], collection["uuid"], "\n".join(errors)))
-        collection["properties"]["status"] = "rejected"
-        collection["properties"]["errors"] = errors
-        api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute()
-        return False
-
-
-def run_workflow(api, parent_project, workflow_uuid, name, inputobj):
-    project = api.groups().create(body={
-        "group_class": "project",
-        "name": name,
-        "owner_uuid": parent_project,
-    }, ensure_unique_name=True).execute()
-
-    with tempfile.NamedTemporaryFile() as tmp:
-        tmp.write(json.dumps(inputobj, indent=2).encode('utf-8'))
-        tmp.flush()
-        cmd = ["arvados-cwl-runner",
-               "--submit",
-               "--no-wait",
-               "--project-uuid=%s" % project["uuid"],
-               "arvwf:%s" % workflow_uuid,
-               tmp.name]
-        logging.info("Running %s" % ' '.join(cmd))
-        comp = subprocess.run(cmd, capture_output=True)
-    logging.info("Submitted %s", comp.stdout)
-    if comp.returncode != 0:
-        logging.error(comp.stderr.decode('utf-8'))
-
-    return project
-
-
-def start_fastq_to_fasta(api, collection,
-                         analysis_project,
-                         fastq_workflow_uuid,
-                         tgt,
-                         sample_id):
-
-    params = {
-        "metadata": {
-            "class": "File",
-            "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
-        },
-        "ref_fasta": {
-            "class": "File",
-            "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
-        },
-        "sample_id": sample_id
-    }
-
-    if tgt.startswith("reads.fastq"):
-        params["fastq_forward"] = {
-            "class": "File",
-            "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
-        }
-    elif tgt.startswith("reads_1.fastq"):
-        params["fastq_forward"] = {
-            "class": "File",
-            "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
-        }
-        params["fastq_reverse"] = {
-            "class": "File",
-            "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+                tgt = None
+                paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
+                for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+                    if n not in col:
+                        continue
+                    with col.open(n, 'rb') as qf:
+                        tgt = qc_fasta(qf)[0]
+                        if tgt != n and tgt != paired.get(n):
+                            errors.append("Expected %s but magic says it should be %s" % (n, tgt))
+                        elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+                            self.start_fastq_to_fasta(collection, n, sample_id)
+                            return False
+                if tgt is None:
+                    errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq" % collection["name"])
+            except Exception as v:
+                errors.append(str(v))
+
+
+        if not errors:
+            # Move it to the "validated" project to be included in the next analysis
+            if "errors" in collection["properties"]:
+                del collection["properties"]["errors"]
+            collection["properties"]["status"] = "validated"
+            self.api.collections().update(uuid=collection["uuid"], body={
+                "owner_uuid": self.validated_project,
+                "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
+                "properties": collection["properties"]}).execute()
+            logging.info("Added '%s' to validated sequences" % collection["name"])
+            return True
+        else:
+            # It is invalid
+            logging.warning("'%s' (%s) has validation errors: %s" % (
+                collection["name"], collection["uuid"], "\n".join(errors)))
+            collection["properties"]["status"] = "rejected"
+            collection["properties"]["errors"] = errors
+            self.api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute()
+            return False
+
+
+    def run_workflow(self, parent_project, workflow_uuid, name, inputobj):
+        project = self.api.groups().create(body={
+            "group_class": "project",
+            "name": name,
+            "owner_uuid": parent_project,
+        }, ensure_unique_name=True).execute()
+
+        with tempfile.NamedTemporaryFile() as tmp:
+            tmp.write(json.dumps(inputobj, indent=2).encode('utf-8'))
+            tmp.flush()
+            cmd = ["arvados-cwl-runner",
+                   "--submit",
+                   "--no-wait",
+                   "--project-uuid=%s" % project["uuid"],
+                   "arvwf:%s" % workflow_uuid,
+                   tmp.name]
+            logging.info("Running %s" % ' '.join(cmd))
+            comp = subprocess.run(cmd, capture_output=True)
+        logging.info("Submitted %s", comp.stdout)
+        if comp.returncode != 0:
+            logging.error(comp.stderr.decode('utf-8'))
+
+        return project
+
+
+    def start_fastq_to_fasta(self, collection,
+                             tgt,
+                             sample_id):
+
+        params = {
+            "metadata": {
+                "class": "File",
+                "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
+            },
+            "ref_fasta": {
+                "class": "File",
+                "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
+            },
+            "sample_id": sample_id
         }
 
-    newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", params)
-    api.collections().update(uuid=collection["uuid"],
-                             body={"owner_uuid": newproject["uuid"]}).execute()
-
-def start_pangenome_analysis(api,
-                             analysis_project,
-                             pangenome_workflow_uuid,
-                             validated_project,
-                             schema_ref,
-                             exclude_list):
-    validated = arvados.util.list_all(api.collections().list, filters=[
-        ["owner_uuid", "=", validated_project],
-        ["properties.status", "=", "validated"]])
-    inputobj = {
-        "inputReads": [],
-        "metadata": [],
-        "subjects": [],
-        "metadataSchema": {
-            "class": "File",
-            "location": schema_ref
-        },
-        "exclude": {
-            "class": "File",
-            "location": exclude_list
+        if tgt.startswith("reads.fastq"):
+            params["fastq_forward"] = {
+                "class": "File",
+                "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
+            }
+        elif tgt.startswith("reads_1.fastq"):
+            params["fastq_forward"] = {
+                "class": "File",
+                "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
+            }
+            params["fastq_reverse"] = {
+                "class": "File",
+                "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+            }
+
+        newproject = self.run_workflow(self.fastq_project, self.fastq_workflow_uuid, "FASTQ to FASTA", params)
+        self.api.collections().update(uuid=collection["uuid"],
+                                 body={"owner_uuid": newproject["uuid"]}).execute()
+
+    def start_pangenome_analysis(self):
+
+        if self.schema_ref is None:
+            self.upload_schema()
+
+        validated = arvados.util.list_all(self.api.collections().list, filters=[
+            ["owner_uuid", "=", self.validated_project],
+            ["properties.status", "=", "validated"]])
+        inputobj = {
+            "inputReads": [],
+            "metadata": [],
+            "subjects": [],
+            "metadataSchema": {
+                "class": "File",
+                "location": self.schema_ref
+            },
+            "exclude": {
+                "class": "File",
+                "location": self.exclude_list
+            }
         }
-    }
-    validated.sort(key=lambda v: v["portable_data_hash"])
-    for v in validated:
-        inputobj["inputReads"].append({
-            "class": "File",
-            "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
-        })
-        inputobj["metadata"].append({
-            "class": "File",
-            "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
-        })
-        inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
-    run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj)
-
-
-def get_workflow_output_from_project(api, uuid):
-    cr = api.container_requests().list(filters=[['owner_uuid', '=', uuid],
-                                                ["requesting_container_uuid", "=", None]]).execute()
-    if cr["items"] and cr["items"][0]["output_uuid"]:
-        container = api.containers().get(uuid=cr["items"][0]["container_uuid"]).execute()
-        if container["state"] == "Complete" and container["exit_code"] == 0:
-            return cr["items"][0]
-    return None
-
-
-def copy_most_recent_result(api, analysis_project, latest_result_uuid):
-    most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]],
-                                                  order="created_at desc").execute()
-    for m in most_recent_analysis["items"]:
-        wf = get_workflow_output_from_project(api, m["uuid"])
-        if wf:
-            src = api.collections().get(uuid=wf["output_uuid"]).execute()
-            dst = api.collections().get(uuid=latest_result_uuid).execute()
-            if src["portable_data_hash"] != dst["portable_data_hash"]:
-                logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid)
-                api.collections().update(uuid=latest_result_uuid,
-                                         body={"manifest_text": src["manifest_text"],
-                                               "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute()
-            break
-
+        validated.sort(key=lambda v: v["portable_data_hash"])
+        for v in validated:
+            inputobj["inputReads"].append({
+                "class": "File",
+                "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
+            })
+            inputobj["metadata"].append({
+                "class": "File",
+                "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
+            })
+            inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
+        self.run_workflow(self.pangenome_analysis_project, self.pangenome_workflow_uuid, "Pangenome analysis", inputobj)
+
+
+    def get_workflow_output_from_project(self, uuid):
+        cr = self.api.container_requests().list(filters=[['owner_uuid', '=', uuid],
+                                                    ["requesting_container_uuid", "=", None]]).execute()
+        if cr["items"] and cr["items"][0]["output_uuid"]:
+            container = self.api.containers().get(uuid=cr["items"][0]["container_uuid"]).execute()
+            if container["state"] == "Complete" and container["exit_code"] == 0:
+                return cr["items"][0]
+        return None
+
+
+    def copy_most_recent_result(self):
+        most_recent_analysis = self.api.groups().list(filters=[['owner_uuid', '=', self.pangenome_analysis_project]],
+                                                      order="created_at desc").execute()
+        for m in most_recent_analysis["items"]:
+            wf = self.get_workflow_output_from_project(m["uuid"])
+            if wf:
+                src = self.api.collections().get(uuid=wf["output_uuid"]).execute()
+                dst = self.api.collections().get(uuid=self.latest_result_uuid).execute()
+                if src["portable_data_hash"] != dst["portable_data_hash"]:
+                    logging.info("Copying latest result from '%s' to %s", m["name"], self.latest_result_uuid)
+                    self.api.collections().update(uuid=self.latest_result_uuid,
+                                             body={"manifest_text": src["manifest_text"],
+                                                   "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute()
+                break
+
+
+    def move_fastq_to_fasta_results(self):
+        projects = self.api.groups().list(filters=[['owner_uuid', '=', self.fastq_project],
+                                              ["properties.moved_output", "!=", True]],
+                                     order="created_at asc",).execute()
+        for p in projects["items"]:
+            wf = self.get_workflow_output_from_project(p["uuid"])
+            if not wf:
+                continue
 
-def move_fastq_to_fasta_results(api, analysis_project, uploader_project):
-    projects = api.groups().list(filters=[['owner_uuid', '=', analysis_project],
-                                          ["properties.moved_output", "!=", True]],
-                                 order="created_at desc",).execute()
-    for p in projects["items"]:
-        wf = get_workflow_output_from_project(api, p["uuid"])
-        if wf:
             logging.info("Moving completed fastq2fasta result %s back to uploader project", wf["output_uuid"])
-            api.collections().update(uuid=wf["output_uuid"],
-                                     body={"owner_uuid": uploader_project}).execute()
-            p["properties"]["moved_output"] = True
-            api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute()
-            break
 
+            col = arvados.collection.Collection(wf["output_uuid"], api_client=self.api, keep_client=self.keepclient)
+            with col.open("metadata.yaml") as md:
+                metadata_content = ruamel.yaml.round_trip_load(md)
+
+            colprop = col.get_properties()
+            colprop["sequence_label"] = metadata_content["sample"]["sample_id"]
+            self.api.collections().update(uuid=wf["output_uuid"],
+                                     body={"owner_uuid": self.uploader_project,
+                                           "properties": colprop}).execute()
 
-def upload_schema(api, workflow_def_project):
-    schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml")
-    c = arvados.collection.Collection()
-    with c.open("schema.yml", "wb") as f:
-        f.write(schema_resource.read())
-    pdh = c.portable_data_hash()
-    wd = api.collections().list(filters=[["owner_uuid", "=", workflow_def_project],
-                                         ["portable_data_hash", "=", pdh]]).execute()
-    if len(wd["items"]) == 0:
-        c.save_new(owner_uuid=workflow_def_project, name="Metadata schema", ensure_unique_name=True)
-    return "keep:%s/schema.yml" % pdh
-
-
-def print_status(api, uploader_project, fmt):
-    pending = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", uploader_project]])
-    out = []
-    status = {}
-    for p in pending:
-        prop = p["properties"]
-        out.append(prop)
-        if "status" not in prop:
-            prop["status"] = "pending"
-        prop["created_at"] = p["created_at"]
-        prop["uuid"] = p["uuid"]
-        status[prop["status"]] = status.get(prop["status"], 0) + 1
-    if fmt == "html":
-        print(
-"""
-<html>
-<body>
-""")
-        print("<p>Total collections in upload project %s</p>" % len(out))
-        print("<p>Status %s</p>" % status)
-        print(
-"""
-<table>
-<tr><th>Collection</th>
-<th>Sequence label</th>
-<th>Status</th>
-<th>Errors</th></tr>
-""")
-        for r in out:
-            print("<tr valign='top'>")
-            print("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
-            print("<td>%s</td>" % r["sequence_label"])
-            print("<td>%s</td>" % r["status"])
-            print("<td><pre>%s</pre></td>" % "\n".join(r.get("errors", [])))
-            print("</tr>")
-        print(
-"""
-</table>
-</body>
-</html>
-""")
-    else:
-        print(json.dumps(out, indent=2))
+            p["properties"]["moved_output"] = True
+            self.api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute()
+
+
+    def upload_schema(self):
+        schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml")
+        c = arvados.collection.Collection(api_client=self.api, keep_client=self.keepclient)
+        with c.open("schema.yml", "wb") as f:
+            f.write(schema_resource.read())
+        pdh = c.portable_data_hash()
+        wd = self.api.collections().list(filters=[["owner_uuid", "=", self.workflow_def_project],
+                                             ["portable_data_hash", "=", pdh]]).execute()
+        if len(wd["items"]) == 0:
+            c.save_new(owner_uuid=self.workflow_def_project, name="Metadata schema", ensure_unique_name=True)
+        self.schema_ref = "keep:%s/schema.yml" % pdh
+
+
+    def print_status(self, fmt):
+        pending = arvados.util.list_all(self.api.collections().list, filters=[["owner_uuid", "=", self.uploader_project]])
+        out = []
+        status = {}
+        for p in pending:
+            prop = p["properties"]
+            out.append(prop)
+            if "status" not in prop:
+                prop["status"] = "pending"
+            prop["created_at"] = p["created_at"]
+            prop["uuid"] = p["uuid"]
+            status[prop["status"]] = status.get(prop["status"], 0) + 1
+        if fmt == "html":
+            print(
+    """
+    <html>
+    <body>
+    """)
+            print("<p>Total collections in upload project %s</p>" % len(out))
+            print("<p>Status %s</p>" % status)
+            print(
+    """
+    <table>
+    <tr><th>Collection</th>
+    <th>Sequence label</th>
+    <th>Status</th>
+    <th>Errors</th></tr>
+    """)
+            for r in out:
+                print("<tr valign='top'>")
+                print("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+                print("<td>%s</td>" % r["sequence_label"])
+                print("<td>%s</td>" % r["status"])
+                print("<td><pre>%s</pre></td>" % "\n".join(r.get("errors", [])))
+                print("</tr>")
+            print(
+    """
+    </table>
+    </body>
+    </html>
+    """)
+        else:
+            print(json.dumps(out, indent=2))
 
 def main():
     parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project')
@@ -310,50 +341,45 @@ def main():
     args = parser.parse_args()
 
     api = arvados.api()
-
-
-
-    schema_ref = upload_schema(api, args.workflow_def_project)
+    keepclient = arvados.keep.KeepClient(api_client=api)
+
+    seqanalyzer = SeqAnalyzer(api, keepclient,
+                              args.uploader_project,
+                              args.pangenome_analysis_project,
+                              args.fastq_project,
+                              args.validated_project,
+                              args.workflow_def_project,
+                              args.pangenome_workflow_uuid,
+                              args.fastq_workflow_uuid,
+                              args.exclude_list,
+                              args.latest_result_collection)
 
     if args.kickoff:
         logging.info("Starting a single analysis run")
-        start_pangenome_analysis(api,
-                                 args.pangenome_analysis_project,
-                                 args.pangenome_workflow_uuid,
-                                 args.validated_project,
-                                 schema_ref,
-                                 args.exclude_list)
+        seqanalyzer.start_pangenome_analysis()
         return
 
     if args.print_status:
-        print_status(api, args.uploader_project, args.print_status)
+        seqanalyzer.print_status(args.print_status)
         exit(0)
 
     logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project))
 
     while True:
-        move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project)
-
-        new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]])
-        at_least_one_new_valid_seq = False
-        for c in new_collections:
-            at_least_one_new_valid_seq = validate_upload(api, c,
-                                                         args.validated_project,
-                                                         args.fastq_project,
-                                                         args.fastq_workflow_uuid,
-                                                         args.revalidate) or at_least_one_new_valid_seq
-
-        if at_least_one_new_valid_seq and not args.no_start_analysis:
-            start_pangenome_analysis(api,
-                                     args.pangenome_analysis_project,
-                                     args.pangenome_workflow_uuid,
-                                     args.validated_project,
-                                     schema_ref,
-                                     args.exclude_list)
-
-        copy_most_recent_result(api,
-                                args.pangenome_analysis_project,
-                                args.latest_result_collection)
+        try:
+            seqanalyzer.move_fastq_to_fasta_results()
+
+            new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]])
+            at_least_one_new_valid_seq = False
+            for c in new_collections:
+                at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq
+
+            if at_least_one_new_valid_seq and not args.no_start_analysis:
+                seqanalyzer.start_pangenome_analysis()
+
+            seqanalyzer.copy_most_recent_result()
+        except Exception as e:
+            logging.exception("Error in main loop")
 
         if args.once:
             break
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index ee852fa..0aead3b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -16,7 +16,7 @@ $graph:
   fields:
     license_type:
       doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
-      type: string?
+      type: string
       jsonldPredicate:
           _id: https://creativecommons.org/ns#License
     title:
@@ -264,7 +264,7 @@ $graph:
     virus: virusSchema
     technology: technologySchema
     submitter: submitterSchema
-    license: licenseSchema
+    license: ["null", licenseSchema]
     id:
       doc: The subject (eg the fasta/fastq file) that the metadata describes
       type: string
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 7331e86..bbc7309 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -17,7 +17,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
   MainSchema:submitter @:submitterShape ;
   MainSchema:technology @:technologyShape ;
   MainSchema:virus @:virusShape;
-  MainSchema:license @:licenseShape;
+  MainSchema:license @:licenseShape ?;
 }
 
 :hostShape  {
@@ -71,7 +71,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :licenseShape{
-    cc:License xsd:string ?;
+    cc:License xsd:string ;
     dc:Title xsd:string ?;
     cc:attributionName xsd:string ?;
     cc:attributionURL xsd:string ?;
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index f744a8c..6049bf9 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
     try:
         log.debug("Checking metadata" if do_qc else "Skipping metadata check")
         if do_qc and not qc_metadata(metadata.name):
-            log.warning("Failed metadata qc")
+            log.warning("Failed metadata QC")
             failed = True
     except Exception as e:
-        log.debug(e)
-        print(e)
+        log.exception("Failed metadata QC")
         failed = True
 
     target = []
@@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
             target[0] = ("reads_1."+target[0][0][6:], target[0][1])
             target[1] = ("reads_2."+target[1][0][6:], target[0][1])
     except Exception as e:
-        log.debug(e)
-        print(e)
+        log.exception("Failed sequence QC")
         failed = True
 
     if failed:
@@ -82,7 +80,7 @@ def main():
     seqlabel = target[0][1]
 
     if args.validate:
-        print("Valid")
+        log.info("Valid")
         exit(0)
 
     col = arvados.collection.Collection(api_client=api)
@@ -91,10 +89,10 @@ def main():
     if args.sequence_p2:
         upload_sequence(col, target[1], args.sequence_p2)
 
-    print("Reading metadata")
+    log.info("Reading metadata")
     with col.open("metadata.yaml", "w") as f:
         r = args.metadata.read(65536)
-        print(r[0:20])
+        log.info(r[0:20])
         while r:
             f.write(r)
             r = args.metadata.read(65536)
@@ -118,7 +116,7 @@ def main():
                                            ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
     if dup["items"]:
         # This exact collection has been uploaded before.
-        print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+        log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
         exit(1)
 
     if args.trusted:
@@ -131,9 +129,9 @@ def main():
                  (seqlabel, properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
-    print("Saved to %s" % col.manifest_locator())
-
-    print("Done")
+    log.info("Saved to %s" % col.manifest_locator())
+    log.info("Done")
+    exit(0)
 
 if __name__ == "__main__":
     main()
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 8089883..7dd07fe 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -8,7 +8,7 @@ import os
 import sys
 import re
 import string
-import yaml
+import ruamel.yaml as yaml
 import pkg_resources
 from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
 import os.path
@@ -16,6 +16,9 @@ import requests
 import io
 import arvados
 from markupsafe import Markup
+from schema_salad.sourceline import add_lc_filename
+from schema_salad.schema import shortname
+from typing import MutableSequence, MutableMapping
 
 ARVADOS_API = 'lugli.arvadosapi.com'
 ANONYMOUS_TOKEN = '5o42qdxpxp5cj15jqjf7vnxx5xduhm4ret703suuoa3ivfglfh'
@@ -47,6 +50,8 @@ def type_to_heading(type_name):
     Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading.
     """
 
+    type_name = shortname(type_name)
+
     print(type_name,file=sys.stderr)
     # Remove camel case
     decamel = re.sub('([A-Z])', r' \1', type_name)
@@ -78,7 +83,7 @@ def is_iri(string):
 
     return string.startswith('http')
 
-def generate_form(schema, options):
+def generate_form(components, options):
     """
     Linearize the schema into a list of dicts.
 
@@ -101,9 +106,6 @@ def generate_form(schema, options):
     IRI.
     """
 
-    # Get the list of form components, one of which is the root
-    components = schema.get('$graph', [])
-
     # Find the root
     root_name = None
     # And also index components by type name
@@ -131,55 +133,54 @@ def generate_form(schema, options):
             # First make a heading, if we aren't the very root of the form
             yield {'heading': type_to_heading(type_name)}
 
-        for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items():
+        for field in by_name.get(type_name, {}).get('fields', []):
+            field_name = shortname(field["name"])
+            field_type = field["type"]
             # For each field
 
             ref_iri = None
             docstring = None
-            if not isinstance(field_type, str):
-                # If the type isn't a string
-
-                # It may have documentation
-                docstring = field_type.get('doc', None)
-
-                # See if it has a more info/what goes here URL
-                predicate = field_type.get('jsonldPredicate', {})
-                # Predicate may be a URL, a dict with a URL in _id, maybe a
-                # dict with a URL in _type, or a dict with _id and _type but no
-                # URLs anywhere. Some of these may not technically be allowed
-                # by the format, but if they occur, we might as well try to
-                # handle them.
-                if isinstance(predicate, str):
-                    if is_iri(predicate):
-                        ref_iri = predicate
-                else:
-                    # Assume it's a dict. Look at the fields we know about.
-                    for field in ['_id', 'type']:
-                        field_value = predicate.get(field, None)
-                        if isinstance(field_value, str) and is_iri(field_value) and ref_iri is None:
-                            # Take the first URL-looking thing we find
-                            ref_iri = field_value
-                            break
 
-
-                # Now overwrite the field type with the actual type string
-                field_type = field_type.get('type', '')
-
-            # Decide if the field is optional (type ends in ?)
             optional = False
-            if field_type.endswith('?'):
-                # It's optional
-                optional = True
-                # Drop the ?
-                field_type = field_type[:-1]
-
-            # Decide if the field is a list (type ends in [])
             is_list = False
-            if field_type.endswith('[]'):
-                # It's a list
-                is_list = True
-                # Reduce to the normal type
-                field_type = field_type[:-2]
+
+            # It may have documentation
+            docstring = field.get('doc', None)
+
+            # See if it has a more info/what goes here URL
+            predicate = field.get('jsonldPredicate', {})
+            # Predicate may be a URL, a dict with a URL in _id, maybe a
+            # dict with a URL in _type, or a dict with _id and _type but no
+            # URLs anywhere. Some of these may not technically be allowed
+            # by the format, but if they occur, we might as well try to
+            # handle them.
+            if isinstance(predicate, str):
+                if is_iri(predicate):
+                    ref_iri = predicate
+            else:
+                # Assume it's a dict. Look at the fields we know about.
+                for field in ['_id', 'type']:
+                    field_value = predicate.get(field, None)
+                    if isinstance(field_value, str) and is_iri(field_value) and ref_iri is None:
+                        # Take the first URL-looking thing we find
+                        ref_iri = field_value
+                        break
+
+            if isinstance(field_type, MutableSequence):
+                if field_type[0] == "null" and len(field_type) == 2:
+                    optional = True
+                    field_type = field_type[1]
+                else:
+                    raise Exception("Can't handle it")
+
+            if isinstance(field_type, MutableMapping):
+                if field_type["type"] == "array":
+                    # Now replace the field type with the actual type string
+                    is_list = True
+                    field_type = field_type.get('items', '')
+                else:
+                    field_type = field_type.get('type', '')
+                    pass
 
             if field_type in by_name:
                 # This is a subrecord. We need to recurse
@@ -227,15 +228,24 @@ def generate_form(schema, options):
     return list(walk_fields(root_name))
 
 
-# At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
-if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
-    METADATA_SCHEMA = yaml.safe_load(open("bh20sequploader/bh20seq-schema.yml","r").read())
-    METADATA_OPTION_DEFINITIONS = yaml.safe_load(open("bh20sequploader/bh20seq-options.yml","r").read())
-else:
-    METADATA_SCHEMA = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
-    METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
-# print(METADATA_SCHEMA,file=sys.stderr)
-FORM_ITEMS = generate_form(METADATA_SCHEMA, METADATA_OPTION_DEFINITIONS)
+import schema_salad.schema
+def load_schema_generate_form():
+    # At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
+    if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
+        METADATA_SCHEMA = yaml.round_trip_load(open("bh20sequploader/bh20seq-schema.yml","r").read())
+        METADATA_OPTION_DEFINITIONS = yaml.safe_load(open("bh20sequploader/bh20seq-options.yml","r").read())
+    else:
+        METADATA_SCHEMA = yaml.round_trip_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
+        METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
+
+    METADATA_SCHEMA["name"] = "bh20seq-schema.yml"
+    add_lc_filename(METADATA_SCHEMA, "bh20seq-schema.yml")
+    metaschema_names, _metaschema_doc, metaschema_loader = schema_salad.schema.get_metaschema()
+    schema_doc, schema_metadata = metaschema_loader.resolve_ref(METADATA_SCHEMA, "")
+
+    return generate_form(schema_doc, METADATA_OPTION_DEFINITIONS)
+
+FORM_ITEMS = load_schema_generate_form()
 
 @app.route('/')
 def send_home():
@@ -435,12 +445,12 @@ def receive_files():
 
         if result.returncode != 0:
             # It didn't work. Complain.
-            error_message="Uploader returned value {} and said:".format(result.returncode) + str(result.stderr.decode('utf-8'))
+            error_message="Uploader returned value {} and said:\n".format(result.returncode) + str(result.stderr.decode('utf-8'))
             print(error_message, file=sys.stderr)
             return (render_template('error.html', error_message=error_message), 403)
         else:
             # It worked. Say so.
-            return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace'))
+            return render_template('success.html', log=result.stderr.decode('utf-8', errors='replace'))
     finally:
         shutil.rmtree(dest_dir)
 
@@ -479,10 +489,13 @@ def pending_table(output, items):
     for r in items:
         if r["status"] != "pending":
             continue
-        output.write("<tr>")
-        output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
-        output.write("<td>%s</td>" % Markup.escape(r["sequence_label"]))
-        output.write("</tr>")
+        try:
+            output.write("<tr>")
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+            output.write("<td>%s</td>" % Markup.escape(r.get("sequence_label")))
+            output.write("</tr>")
+        except:
+            pass
     output.write(
 """
 </table>
@@ -497,18 +510,69 @@ def rejected_table(output, items):
 <th>Errors</th></tr>
 """)
     for r in items:
-        if r["status"] != "rejected":
-            continue
+        try:
+            if r["status"] != "rejected":
+                continue
+            output.write("<tr>")
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+            output.write("<td>%s</td>" % Markup.escape(r.get("sequence_label")))
+            output.write("<td><pre>%s</pre></td>" % Markup.escape("\n".join(r.get("errors", []))))
+            output.write("</tr>")
+        except:
+            pass
+    output.write(
+"""
+</table>
+""")
+
+def workflows_table(output, items):
+    output.write(
+"""
+<table>
+<tr>
+<th>Name</th>
+<th>Sample id</th>
+<th>Started</th>
+<th>Container request</th>
+</tr>
+""")
+    for r in items:
         output.write("<tr>")
-        output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
-        output.write("<td>%s</td>" % Markup.escape(r["sequence_label"]))
-        output.write("<td><pre>%s</pre></td>" % Markup.escape("\n".join(r.get("errors", []))))
+        try:
+            sid = r["mounts"]["/var/lib/cwl/cwl.input.json"]["content"]["sample_id"]
+            output.write("<td>%s</td>" % Markup.escape(r["name"]))
+            output.write("<td>%s</td>" % Markup.escape(sid))
+            output.write("<td>%s</td>" % Markup.escape(r["created_at"]))
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/container_requests/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+        except:
+            pass
         output.write("</tr>")
     output.write(
 """
 </table>
 """)
 
+def validated_table(output, items):
+    output.write(
+"""
+<table>
+<tr>
+<th>Collection</th>
+<th>Sequence label</th>
+</tr>
+""")
+    for r in items:
+        try:
+            output.write("<tr>")
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+            output.write("<td>%s</td>" % Markup.escape(r["properties"].get("sequence_label")))
+            output.write("</tr>")
+        except:
+            pass
+    output.write(
+"""
+</table>
+""")
 
 @app.route('/status')
 def status_page():
@@ -529,22 +593,39 @@ def status_page():
         prop["uuid"] = p["uuid"]
         status[prop["status"]] = status.get(prop["status"], 0) + 1
 
+    workflows = arvados.util.list_all(api.container_requests().list,
+                                      filters=[["name", "in", ["fastq2fasta.cwl"]], ["state", "=", "Committed"]],
+                                      order="created_at asc")
+
     output = io.StringIO()
 
     validated = api.collections().list(filters=[["owner_uuid", "=", VALIDATED_PROJECT]], limit=1).execute()
     status["passed"] = validated["items_available"]
 
-    for s in (("passed", "/download"), ("pending", "#pending"), ("rejected", "#rejected")):
+    for s in (("passed", "/validated"), ("pending", "#pending"), ("rejected", "#rejected")):
         output.write("<p><a href='%s'>%s sequences QC %s</a></p>" % (s[1], status.get(s[0], 0), s[0]))
 
-    output.write("<a id='pending'><h1>Pending</h1>")
+    output.write("<p><a href='%s'>%s analysis workflows running</a></p>" % ('#workflows', len(workflows)))
+
+    output.write("<a id='pending'><h1>Pending</h1></a>")
     pending_table(output, out)
 
-    output.write("<a id='rejected'><h1>Rejected</h1>")
+    output.write("<a id='rejected'><h1>Rejected</h1></a>")
     rejected_table(output, out)
 
+    output.write("<a id='workflows'><h1>Running Workflows</h1></a>")
+    workflows_table(output, workflows)
+
     return render_template('status.html', table=Markup(output.getvalue()), menu='STATUS')
 
+@app.route('/validated')
+def validated_page():
+    api = arvados.api(host=ARVADOS_API, token=ANONYMOUS_TOKEN, insecure=True)
+    output = io.StringIO()
+    validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", VALIDATED_PROJECT]])
+    validated_table(output, validated)
+    return render_template('validated.html', table=Markup(output.getvalue()), menu='STATUS')
+
 @app.route('/demo')
 def demo_page():
     return render_template('demo.html',menu='DEMO')
@@ -569,7 +650,6 @@ def map_page():
     return render_template('map.html',menu='DEMO')
 
 
-
 ## Dynamic API functions starting here
 ## This is quick and dirty for now, just to get something out and demonstrate the queries
 ## Feel free to rename the functions/endpoints, feel free to process result so we get nicer JSON
diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
index 47fb408..b28ee9c 100644
--- a/bh20simplewebuploader/static/main.css
+++ b/bh20simplewebuploader/static/main.css
@@ -178,7 +178,7 @@ span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; }
 
 .about {
     display: grid;
-    grid-template-columns: repeat(2, 1fr);
+    grid-template-columns: 1fr 1fr;
     grid-auto-flow: row;
 }
 
@@ -229,7 +229,7 @@ a {
 #metadata_fill_form {
     column-count: 4;
     margin-top: 0.5em;
-    column-width: 250px;
+    column-width: 15em;
 }
 
 .record, .record .field-group, .record .field-group .field {
@@ -238,6 +238,8 @@ a {
     -webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */
     page-break-inside: avoid; /* Firefox */
     break-inside: avoid;
+    display: block;
+    width: 90%;
 }
 
 .record {
@@ -258,6 +260,10 @@ a {
     width: max-content;
 }
 
+.control {
+    width: 100%;
+}
+
 .filter-options {
     width: 100%;
 }
@@ -304,9 +310,10 @@ footer {
 }
 
 .sponsors img {
-    width: 80%;
-    display:block;
-    margin:auto;
+    width: auto;
+    display: block;
+    margin: auto;
+    height: 4em;
 }
 
 .loader {
diff --git a/bh20simplewebuploader/templates/error.html b/bh20simplewebuploader/templates/error.html
index b1d9402..fc08aed 100644
--- a/bh20simplewebuploader/templates/error.html
+++ b/bh20simplewebuploader/templates/error.html
@@ -15,7 +15,7 @@
           </pre>
         </p>
         <p>
-            <a href="/">Click here to try again.</a>
+            <a href="/upload">Click here to try again.</a>
         </p>
         <hr>
     </body>
diff --git a/bh20simplewebuploader/templates/footer.html b/bh20simplewebuploader/templates/footer.html
index 37a6b64..5a1f3c9 100644
--- a/bh20simplewebuploader/templates/footer.html
+++ b/bh20simplewebuploader/templates/footer.html
@@ -21,12 +21,15 @@
           <img src="static/image/covid19biohackathon.png"></a>
       </div>
       <div class="sponsorimg">
-        <a href="https://www.commonwl.org/"><img src="static/image/CWL.png"></a>
+        <a href="https://www.curii.com/"><img src="static/image/curii.logo.ai.svg"></a>
       </div>
       <div class="sponsorimg">
         <a href="https://arvados.org/"><img src="static/image/arvados-logo.png"></a>
       </div>
       <div class="sponsorimg">
+        <a href="https://www.commonwl.org/"><img src="static/image/CWL.png"></a>
+      </div>
+      <div class="sponsorimg">
         <a href="https://uthsc.edu/"><img src="static/image/UTHSC-primary-stacked-logo-4c.png"></a>
       </div>
       <div class="sponsorimg">
diff --git a/bh20simplewebuploader/templates/status.html b/bh20simplewebuploader/templates/status.html
index a1cf28f..e89437e 100644
--- a/bh20simplewebuploader/templates/status.html
+++ b/bh20simplewebuploader/templates/status.html
@@ -7,7 +7,8 @@
 
       <h1>Sequence upload processing status</h1>
 
-        <div class="status">
+      <div class="status">
+
 	  {{ table }}
         </div>
 
diff --git a/bh20simplewebuploader/templates/success.html b/bh20simplewebuploader/templates/success.html
index 9f0987c..c2302fa 100644
--- a/bh20simplewebuploader/templates/success.html
+++ b/bh20simplewebuploader/templates/success.html
@@ -9,7 +9,7 @@
         <h1>Upload Successful</h1>
         <hr>
         <p>
-            Your files have been uploaded. They should soon appear as output of the <a href="/download">Public SARS-CoV-2 Sequence Resource</a>.
+            Your files have been uploaded. You can track their <a href="/status">QC status</a>, once validated they will be part of the <a href="/download">Public SARS-CoV-2 Sequence Resource</a>.
         </p>
         <p>
             The upload log was:
diff --git a/bh20simplewebuploader/templates/validated.html b/bh20simplewebuploader/templates/validated.html
new file mode 100644
index 0000000..cee94bd
--- /dev/null
+++ b/bh20simplewebuploader/templates/validated.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+  {% include 'header.html' %}
+    <body>
+      {% include 'banner.html' %}
+      {% include 'menu.html' %}
+
+      <h1>Validated sequences</h1>
+
+        <div class="status">
+	  {{ table }}
+        </div>
+
+{% include 'footer.html' %}
+
+   </body>
+</html>
diff --git a/scripts/cleanup.py b/scripts/cleanup.py
new file mode 100644
index 0000000..6a82659
--- /dev/null
+++ b/scripts/cleanup.py
@@ -0,0 +1,38 @@
+import arvados
+import arvados.util
+
+api = arvados.api()
+
+delete_patterns = [
+    "%missing%`collection_location`%",
+    "%missing%`technology`%",
+    "%missing%`host_species`%",
+    "%QC fail: alignment%",
+    "%does not look like a valid URI%",
+    "%Duplicate of%"
+    ]
+
+revalidate_patterns = [
+    "%missing%`license`%"
+]
+
+for p in delete_patterns:
+    c = arvados.util.list_all(api.collections().list, filters=[
+        ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+        ["properties.errors", "like", p]])
+    for i in c:
+        print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+        api.collections().delete(uuid=i["uuid"]).execute()
+
+for p in revalidate_patterns:
+    c = arvados.util.list_all(api.collections().list, filters=[
+        ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+        ["properties.errors", "like", p]])
+    for i in c:
+        print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+        pr = i["properties"]
+        if "status" in pr:
+            del pr["status"]
+        if "errors" in pr:
+            del pr["errors"]
+        api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute()