Merge branch 'master' into ebi-submit

author: Pjotr Prins 2020-07-17 11:08:15 +0100
committer: Pjotr Prins 2020-07-17 11:08:15 +0100
commit: 16bb5df907c79cd0ce6bea0015821a2ce51fb992 (patch)
tree: ddb9677cddcc463bb514300189cbd4300b9117ed
parent: 0be9983ef88fd3b925d8fa53e7f9ab2a28703bc0 (diff)
parent: c69046ee9a5e24eadcd8cb885633328b0fd88011 (diff)
download: bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.gz
bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.lz
bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.zip
43 files changed, 1227 insertions, 785 deletions
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 0b52e6b..b3a439d 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -16,277 +16,308 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%
                     level=logging.INFO)
 logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN)
 
-def validate_upload(api, collection, validated_project,
-                    fastq_project, fastq_workflow_uuid,
-                    revalidate):
-    col = arvados.collection.Collection(collection["uuid"])
-
-    if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
-        return False
-
-    # validate the collection here.  Check metadata, etc.
-    logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))
-
-    errors = []
-
-    if collection["owner_uuid"] != validated_project:
-        dup = api.collections().list(filters=[["owner_uuid", "=", validated_project],
-                                              ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
-        if dup["items"]:
-            # This exact collection has been uploaded before.
-            errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
-
-    if not errors:
-        if "metadata.yaml" not in col:
-            errors.append("Missing metadata.yaml", collection["name"])
-        else:
+class SeqAnalyzer:
+
+    def __init__(self, api, keepclient,
+                 uploader_project,
+                 pangenome_analysis_project,
+                 fastq_project,
+                 validated_project,
+                 workflow_def_project,
+                 pangenome_workflow_uuid,
+                 fastq_workflow_uuid,
+                 exclude_list,
+                 latest_result_collection):
+        self.api = api
+        self.keepclient = keepclient
+        self.uploader_project = uploader_project
+        self.pangenome_analysis_project = pangenome_analysis_project
+        self.fastq_project = fastq_project
+        self.validated_project = validated_project
+        self.workflow_def_project = workflow_def_project
+        self.pangenome_workflow_uuid = pangenome_workflow_uuid
+        self.fastq_workflow_uuid = fastq_workflow_uuid
+        self.exclude_list = exclude_list
+        self.latest_result_uuid = latest_result_collection
+        self.schema_ref = None
+
+    def validate_upload(self, collection, revalidate):
+        col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient)
+
+        if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
+            return False
+
+        # validate the collection here.  Check metadata, etc.
+        logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))
+
+        errors = []
+
+        if collection["owner_uuid"] != self.validated_project:
+            dup = self.api.collections().list(filters=[["owner_uuid", "=", self.validated_project],
+                                                  ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
+            if dup["items"]:
+                # This exact collection has been uploaded before.
+                errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+
+        if not errors:
+            if "metadata.yaml" not in col:
+                errors.append("Missing metadata.yaml", collection["name"])
+            else:
+                try:
+                    with col.open("metadata.yaml") as md:
+                        metadata_content = ruamel.yaml.round_trip_load(md)
+                    metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
+                    sample_id = metadata_content["sample"]["sample_id"]
+                    add_lc_filename(metadata_content, metadata_content["id"])
+                    valid = qc_metadata(metadata_content)
+                    if not valid:
+                        errors.append("Failed metadata qc")
+                except Exception as e:
+                    errors.append(str(e))
+
+        if not errors:
             try:
-                metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml"))
-                metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
-                sample_id = metadata_content["sample"]["sample_id"]
-                add_lc_filename(metadata_content, metadata_content["id"])
-                valid = qc_metadata(metadata_content)
-                if not valid:
-                    errors.append("Failed metadata qc")
-            except Exception as e:
-                errors.append(str(e))
-
-    if not errors:
-        try:
-            tgt = None
-            paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
-            for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
-                if n not in col:
-                    continue
-                with col.open(n, 'rb') as qf:
-                    tgt = qc_fasta(qf)[0]
-                    if tgt != n and tgt != paired.get(n):
-                        errors.append("Expected %s but magic says it should be %s", n, tgt)
-                    elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
-                        start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid, n, sample_id)
-                        return False
-            if tgt is None:
-                errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"])
-        except Exception as v:
-            errors.append(str(v))
-
-
-    if not errors:
-        # Move it to the "validated" project to be included in the next analysis
-        if "errors" in collection["properties"]:
-            del collection["properties"]["errors"]
-        collection["properties"]["status"] = "validated"
-        api.collections().update(uuid=collection["uuid"], body={
-            "owner_uuid": validated_project,
-            "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
-            "properties": collection["properties"]}).execute()
-        logging.info("Added '%s' to validated sequences" % collection["name"])
-        return True
-    else:
-        # It is invalid
-        logging.warn("'%s' (%s) has validation errors: %s" % (
-            collection["name"], collection["uuid"], "\n".join(errors)))
-        collection["properties"]["status"] = "rejected"
-        collection["properties"]["errors"] = errors
-        api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute()
-        return False
-
-
-def run_workflow(api, parent_project, workflow_uuid, name, inputobj):
-    project = api.groups().create(body={
-        "group_class": "project",
-        "name": name,
-        "owner_uuid": parent_project,
-    }, ensure_unique_name=True).execute()
-
-    with tempfile.NamedTemporaryFile() as tmp:
-        tmp.write(json.dumps(inputobj, indent=2).encode('utf-8'))
-        tmp.flush()
-        cmd = ["arvados-cwl-runner",
-               "--submit",
-               "--no-wait",
-               "--project-uuid=%s" % project["uuid"],
-               "arvwf:%s" % workflow_uuid,
-               tmp.name]
-        logging.info("Running %s" % ' '.join(cmd))
-        comp = subprocess.run(cmd, capture_output=True)
-    logging.info("Submitted %s", comp.stdout)
-    if comp.returncode != 0:
-        logging.error(comp.stderr.decode('utf-8'))
-
-    return project
-
-
-def start_fastq_to_fasta(api, collection,
-                         analysis_project,
-                         fastq_workflow_uuid,
-                         tgt,
-                         sample_id):
-
-    params = {
-        "metadata": {
-            "class": "File",
-            "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
-        },
-        "ref_fasta": {
-            "class": "File",
-            "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
-        },
-        "sample_id": sample_id
-    }
-
-    if tgt.startswith("reads.fastq"):
-        params["fastq_forward"] = {
-            "class": "File",
-            "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
-        }
-    elif tgt.startswith("reads_1.fastq"):
-        params["fastq_forward"] = {
-            "class": "File",
-            "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
-        }
-        params["fastq_reverse"] = {
-            "class": "File",
-            "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+                tgt = None
+                paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
+                for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+                    if n not in col:
+                        continue
+                    with col.open(n, 'rb') as qf:
+                        tgt = qc_fasta(qf)[0]
+                        if tgt != n and tgt != paired.get(n):
+                            errors.append("Expected %s but magic says it should be %s", n, tgt)
+                        elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+                            self.start_fastq_to_fasta(collection, n, sample_id)
+                            return False
+                if tgt is None:
+                    errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"])
+            except Exception as v:
+                errors.append(str(v))
+
+
+        if not errors:
+            # Move it to the "validated" project to be included in the next analysis
+            if "errors" in collection["properties"]:
+                del collection["properties"]["errors"]
+            collection["properties"]["status"] = "validated"
+            self.api.collections().update(uuid=collection["uuid"], body={
+                "owner_uuid": self.validated_project,
+                "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
+                "properties": collection["properties"]}).execute()
+            logging.info("Added '%s' to validated sequences" % collection["name"])
+            return True
+        else:
+            # It is invalid
+            logging.warn("'%s' (%s) has validation errors: %s" % (
+                collection["name"], collection["uuid"], "\n".join(errors)))
+            collection["properties"]["status"] = "rejected"
+            collection["properties"]["errors"] = errors
+            self.api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute()
+            return False
+
+
+    def run_workflow(self, parent_project, workflow_uuid, name, inputobj):
+        project = self.api.groups().create(body={
+            "group_class": "project",
+            "name": name,
+            "owner_uuid": parent_project,
+        }, ensure_unique_name=True).execute()
+
+        with tempfile.NamedTemporaryFile() as tmp:
+            tmp.write(json.dumps(inputobj, indent=2).encode('utf-8'))
+            tmp.flush()
+            cmd = ["arvados-cwl-runner",
+                   "--submit",
+                   "--no-wait",
+                   "--project-uuid=%s" % project["uuid"],
+                   "arvwf:%s" % workflow_uuid,
+                   tmp.name]
+            logging.info("Running %s" % ' '.join(cmd))
+            comp = subprocess.run(cmd, capture_output=True)
+        logging.info("Submitted %s", comp.stdout)
+        if comp.returncode != 0:
+            logging.error(comp.stderr.decode('utf-8'))
+
+        return project
+
+
+    def start_fastq_to_fasta(self, collection,
+                             tgt,
+                             sample_id):
+
+        params = {
+            "metadata": {
+                "class": "File",
+                "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
+            },
+            "ref_fasta": {
+                "class": "File",
+                "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
+            },
+            "sample_id": sample_id
         }
 
-    newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", params)
-    api.collections().update(uuid=collection["uuid"],
-                             body={"owner_uuid": newproject["uuid"]}).execute()
-
-def start_pangenome_analysis(api,
-                             analysis_project,
-                             pangenome_workflow_uuid,
-                             validated_project,
-                             schema_ref,
-                             exclude_list):
-    validated = arvados.util.list_all(api.collections().list, filters=[
-        ["owner_uuid", "=", validated_project],
-        ["properties.status", "=", "validated"]])
-    inputobj = {
-        "inputReads": [],
-        "metadata": [],
-        "subjects": [],
-        "metadataSchema": {
-            "class": "File",
-            "location": schema_ref
-        },
-        "exclude": {
-            "class": "File",
-            "location": exclude_list
+        if tgt.startswith("reads.fastq"):
+            params["fastq_forward"] = {
+                "class": "File",
+                "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
+            }
+        elif tgt.startswith("reads_1.fastq"):
+            params["fastq_forward"] = {
+                "class": "File",
+                "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
+            }
+            params["fastq_reverse"] = {
+                "class": "File",
+                "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+            }
+
+        newproject = self.run_workflow(self.fastq_project, self.fastq_workflow_uuid, "FASTQ to FASTA", params)
+        self.api.collections().update(uuid=collection["uuid"],
+                                 body={"owner_uuid": newproject["uuid"]}).execute()
+
+    def start_pangenome_analysis(self):
+
+        if self.schema_ref is None:
+            self.upload_schema()
+
+        validated = arvados.util.list_all(self.api.collections().list, filters=[
+            ["owner_uuid", "=", self.validated_project],
+            ["properties.status", "=", "validated"]])
+        inputobj = {
+            "inputReads": [],
+            "metadata": [],
+            "subjects": [],
+            "metadataSchema": {
+                "class": "File",
+                "location": self.schema_ref
+            },
+            "exclude": {
+                "class": "File",
+                "location": self.exclude_list
+            }
         }
-    }
-    validated.sort(key=lambda v: v["portable_data_hash"])
-    for v in validated:
-        inputobj["inputReads"].append({
-            "class": "File",
-            "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
-        })
-        inputobj["metadata"].append({
-            "class": "File",
-            "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
-        })
-        inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
-    run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj)
-
-
-def get_workflow_output_from_project(api, uuid):
-    cr = api.container_requests().list(filters=[['owner_uuid', '=', uuid],
-                                                ["requesting_container_uuid", "=", None]]).execute()
-    if cr["items"] and cr["items"][0]["output_uuid"]:
-        container = api.containers().get(uuid=cr["items"][0]["container_uuid"]).execute()
-        if container["state"] == "Complete" and container["exit_code"] == 0:
-            return cr["items"][0]
-    return None
-
-
-def copy_most_recent_result(api, analysis_project, latest_result_uuid):
-    most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]],
-                                                  order="created_at desc").execute()
-    for m in most_recent_analysis["items"]:
-        wf = get_workflow_output_from_project(api, m["uuid"])
-        if wf:
-            src = api.collections().get(uuid=wf["output_uuid"]).execute()
-            dst = api.collections().get(uuid=latest_result_uuid).execute()
-            if src["portable_data_hash"] != dst["portable_data_hash"]:
-                logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid)
-                api.collections().update(uuid=latest_result_uuid,
-                                         body={"manifest_text": src["manifest_text"],
-                                               "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute()
-            break
-
+        validated.sort(key=lambda v: v["portable_data_hash"])
+        for v in validated:
+            inputobj["inputReads"].append({
+                "class": "File",
+                "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
+            })
+            inputobj["metadata"].append({
+                "class": "File",
+                "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
+            })
+            inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
+        self.run_workflow(self.pangenome_analysis_project, self.pangenome_workflow_uuid, "Pangenome analysis", inputobj)
+
+
+    def get_workflow_output_from_project(self, uuid):
+        cr = self.api.container_requests().list(filters=[['owner_uuid', '=', uuid],
+                                                    ["requesting_container_uuid", "=", None]]).execute()
+        if cr["items"] and cr["items"][0]["output_uuid"]:
+            container = self.api.containers().get(uuid=cr["items"][0]["container_uuid"]).execute()
+            if container["state"] == "Complete" and container["exit_code"] == 0:
+                return cr["items"][0]
+        return None
+
+
+    def copy_most_recent_result(self):
+        most_recent_analysis = self.api.groups().list(filters=[['owner_uuid', '=', self.pangenome_analysis_project]],
+                                                      order="created_at desc").execute()
+        for m in most_recent_analysis["items"]:
+            wf = self.get_workflow_output_from_project(m["uuid"])
+            if wf:
+                src = self.api.collections().get(uuid=wf["output_uuid"]).execute()
+                dst = self.api.collections().get(uuid=self.latest_result_uuid).execute()
+                if src["portable_data_hash"] != dst["portable_data_hash"]:
+                    logging.info("Copying latest result from '%s' to %s", m["name"], self.latest_result_uuid)
+                    self.api.collections().update(uuid=self.latest_result_uuid,
+                                             body={"manifest_text": src["manifest_text"],
+                                                   "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute()
+                break
+
+
+    def move_fastq_to_fasta_results(self):
+        projects = self.api.groups().list(filters=[['owner_uuid', '=', self.fastq_project],
+                                              ["properties.moved_output", "!=", True]],
+                                     order="created_at asc",).execute()
+        for p in projects["items"]:
+            wf = self.get_workflow_output_from_project(p["uuid"])
+            if not wf:
+                continue
 
-def move_fastq_to_fasta_results(api, analysis_project, uploader_project):
-    projects = api.groups().list(filters=[['owner_uuid', '=', analysis_project],
-                                          ["properties.moved_output", "!=", True]],
-                                 order="created_at desc",).execute()
-    for p in projects["items"]:
-        wf = get_workflow_output_from_project(api, p["uuid"])
-        if wf:
             logging.info("Moving completed fastq2fasta result %s back to uploader project", wf["output_uuid"])
-            api.collections().update(uuid=wf["output_uuid"],
-                                     body={"owner_uuid": uploader_project}).execute()
-            p["properties"]["moved_output"] = True
-            api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute()
-            break
 
+            col = arvados.collection.Collection(wf["output_uuid"], api_client=self.api, keep_client=self.keepclient)
+            with col.open("metadata.yaml") as md:
+                metadata_content = ruamel.yaml.round_trip_load(md)
+
+            colprop = col.get_properties()
+            colprop["sequence_label"] = metadata_content["sample"]["sample_id"]
+            self.api.collections().update(uuid=wf["output_uuid"],
+                                     body={"owner_uuid": self.uploader_project,
+                                           "properties": colprop}).execute()
 
-def upload_schema(api, workflow_def_project):
-    schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml")
-    c = arvados.collection.Collection()
-    with c.open("schema.yml", "wb") as f:
-        f.write(schema_resource.read())
-    pdh = c.portable_data_hash()
-    wd = api.collections().list(filters=[["owner_uuid", "=", workflow_def_project],
-                                         ["portable_data_hash", "=", pdh]]).execute()
-    if len(wd["items"]) == 0:
-        c.save_new(owner_uuid=workflow_def_project, name="Metadata schema", ensure_unique_name=True)
-    return "keep:%s/schema.yml" % pdh
-
-
-def print_status(api, uploader_project, fmt):
-    pending = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", uploader_project]])
-    out = []
-    status = {}
-    for p in pending:
-        prop = p["properties"]
-        out.append(prop)
-        if "status" not in prop:
-            prop["status"] = "pending"
-        prop["created_at"] = p["created_at"]
-        prop["uuid"] = p["uuid"]
-        status[prop["status"]] = status.get(prop["status"], 0) + 1
-    if fmt == "html":
-        print(
-"""
-<html>
-<body>
-""")
-        print("<p>Total collections in upload project %s</p>" % len(out))
-        print("<p>Status %s</p>" % status)
-        print(
-"""
-<table>
-<tr><th>Collection</th>
-<th>Sequence label</th>
-<th>Status</th>
-<th>Errors</th></tr>
-""")
-        for r in out:
-            print("<tr valign='top'>")
-            print("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
-            print("<td>%s</td>" % r["sequence_label"])
-            print("<td>%s</td>" % r["status"])
-            print("<td><pre>%s</pre></td>" % "\n".join(r.get("errors", [])))
-            print("</tr>")
-        print(
-"""
-</table>
-</body>
-</html>
-""")
-    else:
-        print(json.dumps(out, indent=2))
+            p["properties"]["moved_output"] = True
+            self.api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute()
+
+
+    def upload_schema(self):
+        schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml")
+        c = arvados.collection.Collection(api_client=self.api, keep_client=self.keepclient)
+        with c.open("schema.yml", "wb") as f:
+            f.write(schema_resource.read())
+        pdh = c.portable_data_hash()
+        wd = self.api.collections().list(filters=[["owner_uuid", "=", self.workflow_def_project],
+                                             ["portable_data_hash", "=", pdh]]).execute()
+        if len(wd["items"]) == 0:
+            c.save_new(owner_uuid=self.workflow_def_project, name="Metadata schema", ensure_unique_name=True)
+        self.schema_ref = "keep:%s/schema.yml" % pdh
+
+
+    def print_status(self, fmt):
+        pending = arvados.util.list_all(self.api.collections().list, filters=[["owner_uuid", "=", self.uploader_project]])
+        out = []
+        status = {}
+        for p in pending:
+            prop = p["properties"]
+            out.append(prop)
+            if "status" not in prop:
+                prop["status"] = "pending"
+            prop["created_at"] = p["created_at"]
+            prop["uuid"] = p["uuid"]
+            status[prop["status"]] = status.get(prop["status"], 0) + 1
+        if fmt == "html":
+            print(
+    """
+    <html>
+    <body>
+    """)
+            print("<p>Total collections in upload project %s</p>" % len(out))
+            print("<p>Status %s</p>" % status)
+            print(
+    """
+    <table>
+    <tr><th>Collection</th>
+    <th>Sequence label</th>
+    <th>Status</th>
+    <th>Errors</th></tr>
+    """)
+            for r in out:
+                print("<tr valign='top'>")
+                print("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+                print("<td>%s</td>" % r["sequence_label"])
+                print("<td>%s</td>" % r["status"])
+                print("<td><pre>%s</pre></td>" % "\n".join(r.get("errors", [])))
+                print("</tr>")
+            print(
+    """
+    </table>
+    </body>
+    </html>
+    """)
+        else:
+            print(json.dumps(out, indent=2))
 
 def main():
     parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project')
@@ -310,50 +341,45 @@ def main():
     args = parser.parse_args()
 
     api = arvados.api()
-
-
-
-    schema_ref = upload_schema(api, args.workflow_def_project)
+    keepclient = arvados.keep.KeepClient(api_client=api)
+
+    seqanalyzer = SeqAnalyzer(api, keepclient,
+                              args.uploader_project,
+                              args.pangenome_analysis_project,
+                              args.fastq_project,
+                              args.validated_project,
+                              args.workflow_def_project,
+                              args.pangenome_workflow_uuid,
+                              args.fastq_workflow_uuid,
+                              args.exclude_list,
+                              args.latest_result_collection)
 
     if args.kickoff:
         logging.info("Starting a single analysis run")
-        start_pangenome_analysis(api,
-                                 args.pangenome_analysis_project,
-                                 args.pangenome_workflow_uuid,
-                                 args.validated_project,
-                                 schema_ref,
-                                 args.exclude_list)
+        seqanalyzer.start_pangenome_analysis()
         return
 
     if args.print_status:
-        print_status(api, args.uploader_project, args.print_status)
+        seqanalyzer.print_status(args.print_status)
         exit(0)
 
     logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project))
 
     while True:
-        move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project)
-
-        new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]])
-        at_least_one_new_valid_seq = False
-        for c in new_collections:
-            at_least_one_new_valid_seq = validate_upload(api, c,
-                                                         args.validated_project,
-                                                         args.fastq_project,
-                                                         args.fastq_workflow_uuid,
-                                                         args.revalidate) or at_least_one_new_valid_seq
-
-        if at_least_one_new_valid_seq and not args.no_start_analysis:
-            start_pangenome_analysis(api,
-                                     args.pangenome_analysis_project,
-                                     args.pangenome_workflow_uuid,
-                                     args.validated_project,
-                                     schema_ref,
-                                     args.exclude_list)
-
-        copy_most_recent_result(api,
-                                args.pangenome_analysis_project,
-                                args.latest_result_collection)
+        try:
+            seqanalyzer.move_fastq_to_fasta_results()
+
+            new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]])
+            at_least_one_new_valid_seq = False
+            for c in new_collections:
+                at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq
+
+            if at_least_one_new_valid_seq and not args.no_start_analysis:
+                seqanalyzer.start_pangenome_analysis()
+
+            seqanalyzer.copy_most_recent_result()
+        except Exception as e:
+            logging.exeception("Error in main loop")
 
         if args.once:
             break
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index b3d4d12..0aead3b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -1,6 +1,7 @@
 $base: http://biohackathon.org/bh20-seq-schema
 $namespaces:
   cc:  http://creativecommons.org/ns#
+  dc:  http://purl.org/metadata/dublin_core_elements#
   sch: https://schema.org/
   efo: http://www.ebi.ac.uk/efo/
   obo: http://purl.obolibrary.org/obo/
@@ -15,24 +16,29 @@ $graph:
   fields:
     license_type:
       doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
-      type: string?
+      type: string
       jsonldPredicate:
           _id: https://creativecommons.org/ns#License
     title:
       doc: Attribution title related to data license
       type: string?
       jsonldPredicate:
-          _id: http://semanticscience.org/resource/SIO_001167
+          _id: http://purl.org/metadata/dublin_core_elements#Title
+    attribution_name:
+      doc: Attribution NAME related to data license
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#attributionName
     attribution_url:
       doc: Attribution URL related to data license
       type: string?
       jsonldPredicate:
-          _id: https://creativecommons.org/ns#Work
+          _id: https://creativecommons.org/ns#attributionURL
     attribution_source:
       doc: Attribution source URL related to data license
       type: string?
       jsonldPredicate:
-          _id: https://creativecommons.org/ns#Work
+          _id: https://creativecommons.org/ns#attributionSource
 
 - name: hostSchema
   type: record
@@ -258,6 +264,7 @@ $graph:
     virus: virusSchema
     technology: technologySchema
     submitter: submitterSchema
+    license: ["null", licenseSchema]
     id:
       doc: The subject (eg the fasta/fastq file) that the metadata describes
       type: string
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 965229c..bbc7309 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -1,6 +1,8 @@
 PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#>
 PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
 PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/>
+PREFIX cc:  <http://creativecommons.org/ns#>
+PREFIX dc:  <http://purl.org/metadata/dublin_core_elements#>
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
 PREFIX obo: <http://purl.obolibrary.org/obo/>
 PREFIX sio: <http://semanticscience.org/resource/>
@@ -15,10 +17,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
   MainSchema:submitter @:submitterShape ;
   MainSchema:technology @:technologyShape ;
   MainSchema:virus @:virusShape;
+  MainSchema:license @:licenseShape ?;
 }
 
 :hostShape  {
-  	efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
+    efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
     sio:SIO_000115 xsd:string ?;
     obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?;
     obo:PATO_0000011 xsd:integer ?;
@@ -32,14 +35,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 
 :sampleShape  {
     sio:SIO_000115 xsd:string;
-	  evs:C25164 xsd:string;
-	  obo:GAZ_00000448 [wikidata:~] ;
+    evs:C25164 xsd:string;
+    obo:GAZ_00000448 [wikidata:~] ;
     obo:OBI_0001895 xsd:string ?;
     obo:NCIT_C41206 xsd:string ?;
     obo:OBI_0001479 IRI {0,2};
     obo:OBI_0001472 xsd:string ?;
     sio:SIO_001167 xsd:string ?;
-	edam:data_2091 IRI {0,3};
+    edam:data_2091 IRI {0,3};
 }
 
 :submitterShape {
@@ -47,7 +50,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
     sio:SIO_000116 xsd:string *;
     sio:SIO_000172 xsd:string ?;
     obo:NCIT_C37984 xsd:string ?;
-  	obo:NCIT_C37900 xsd:string ?;
+    obo:NCIT_C37900 xsd:string ?;
     efo:EFO_0001741 xsd:string ?;
     obo:NCIT_C42781 xsd:string ?;
     obo:NCIT_C19026 xsd:string ?;
@@ -63,6 +66,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
 }
 
 :virusShape{
-	edam:data_1875 [ obo:NCBITaxon_~ ] ;
-  	sio:SIO_010055 xsd:string ?;
+    edam:data_1875 [ obo:NCBITaxon_~ ] ;
+    sio:SIO_010055 xsd:string ?;
 }
+
+:licenseShape{
+    cc:License xsd:string ;
+    dc:Title xsd:string ?;
+    cc:attributionName xsd:string ?;
+    cc:attributionURL xsd:string ?;
+    cc:attributionSource xsd:string ?;
+}
\ No newline at end of file
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index f744a8c..6049bf9 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
     try:
         log.debug("Checking metadata" if do_qc else "Skipping metadata check")
         if do_qc and not qc_metadata(metadata.name):
-            log.warning("Failed metadata qc")
+            log.warning("Failed metadata QC")
             failed = True
     except Exception as e:
-        log.debug(e)
-        print(e)
+        log.exception("Failed metadata QC")
         failed = True
 
     target = []
@@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
             target[0] = ("reads_1."+target[0][0][6:], target[0][1])
             target[1] = ("reads_2."+target[1][0][6:], target[0][1])
     except Exception as e:
-        log.debug(e)
-        print(e)
+        log.exception("Failed sequence QC")
         failed = True
 
     if failed:
@@ -82,7 +80,7 @@ def main():
     seqlabel = target[0][1]
 
     if args.validate:
-        print("Valid")
+        log.info("Valid")
         exit(0)
 
     col = arvados.collection.Collection(api_client=api)
@@ -91,10 +89,10 @@ def main():
     if args.sequence_p2:
         upload_sequence(col, target[1], args.sequence_p2)
 
-    print("Reading metadata")
+    log.info("Reading metadata")
     with col.open("metadata.yaml", "w") as f:
         r = args.metadata.read(65536)
-        print(r[0:20])
+        log.info(r[0:20])
         while r:
             f.write(r)
             r = args.metadata.read(65536)
@@ -118,7 +116,7 @@ def main():
                                            ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
     if dup["items"]:
         # This exact collection has been uploaded before.
-        print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+        log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
         exit(1)
 
     if args.trusted:
@@ -131,9 +129,9 @@ def main():
                  (seqlabel, properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
-    print("Saved to %s" % col.manifest_locator())
-
-    print("Done")
+    log.info("Saved to %s" % col.manifest_locator())
+    log.info("Done")
+    exit(0)
 
 if __name__ == "__main__":
     main()
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 37eb4e8..0c7e16d 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True):
                 except Exception as e:
                     logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e)
 
-                if similarity and similarity < 70.0:
+                if similarity < 70.0:
                     raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity))
-                if similarity == 0:
-                    raise ValueError("QC fail")
 
         return ("sequence.fasta"+gz, seqlabel)
     elif seq_type == "text/fastq":
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 77e345b..206f884 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -8,7 +8,7 @@ import os
 import sys
 import re
 import string
-import yaml
+import ruamel.yaml as yaml
 import pkg_resources
 from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
 import os.path
@@ -16,6 +16,9 @@ import requests
 import io
 import arvados
 from markupsafe import Markup
+from schema_salad.sourceline import add_lc_filename
+from schema_salad.schema import shortname
+from typing import MutableSequence, MutableMapping
 
 ARVADOS_API = 'lugli.arvadosapi.com'
 ANONYMOUS_TOKEN = '5o42qdxpxp5cj15jqjf7vnxx5xduhm4ret703suuoa3ivfglfh'
@@ -47,6 +50,9 @@ def type_to_heading(type_name):
     Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading.
     """
 
+    type_name = shortname(type_name)
+
+    print(type_name,file=sys.stderr)
     # Remove camel case
     decamel = re.sub('([A-Z])', r' \1', type_name)
     # Split
@@ -77,7 +83,7 @@ def is_iri(string):
 
     return string.startswith('http')
 
-def generate_form(schema, options):
+def generate_form(components, options):
     """
     Linearize the schema into a list of dicts.
 
@@ -100,9 +106,6 @@ def generate_form(schema, options):
     IRI.
     """
 
-    # Get the list of form components, one of which is the root
-    components = schema.get('$graph', [])
-
     # Find the root
     root_name = None
     # And also index components by type name
@@ -130,55 +133,54 @@ def generate_form(schema, options):
             # First make a heading, if we aren't the very root of the form
             yield {'heading': type_to_heading(type_name)}
 
-        for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items():
+        for field in by_name.get(type_name, {}).get('fields', []):
+            field_name = shortname(field["name"])
+            field_type = field["type"]
             # For each field
 
             ref_iri = None
             docstring = None
-            if not isinstance(field_type, str):
-                # If the type isn't a string
-
-                # It may have documentation
-                docstring = field_type.get('doc', None)
-
-                # See if it has a more info/what goes here URL
-                predicate = field_type.get('jsonldPredicate', {})
-                # Predicate may be a URL, a dict with a URL in _id, maybe a
-                # dict with a URL in _type, or a dict with _id and _type but no
-                # URLs anywhere. Some of these may not technically be allowed
-                # by the format, but if they occur, we might as well try to
-                # handle them.
-                if isinstance(predicate, str):
-                    if is_iri(predicate):
-                        ref_iri = predicate
-                else:
-                    # Assume it's a dict. Look at the fields we know about.
-                    for field in ['_id', 'type']:
-                        field_value = predicate.get(field, None)
-                        if isinstance(field_value, str) and is_iri(field_value) and ref_iri is None:
-                            # Take the first URL-looking thing we find
-                            ref_iri = field_value
-                            break
 
-
-                # Now overwrite the field type with the actual type string
-                field_type = field_type.get('type', '')
-
-            # Decide if the field is optional (type ends in ?)
             optional = False
-            if field_type.endswith('?'):
-                # It's optional
-                optional = True
-                # Drop the ?
-                field_type = field_type[:-1]
-
-            # Decide if the field is a list (type ends in [])
             is_list = False
-            if field_type.endswith('[]'):
-                # It's a list
-                is_list = True
-                # Reduce to the normal type
-                field_type = field_type[:-2]
+
+            # It may have documentation
+            docstring = field.get('doc', None)
+
+            # See if it has a more info/what goes here URL
+            predicate = field.get('jsonldPredicate', {})
+            # Predicate may be a URL, a dict with a URL in _id, maybe a
+            # dict with a URL in _type, or a dict with _id and _type but no
+            # URLs anywhere. Some of these may not technically be allowed
+            # by the format, but if they occur, we might as well try to
+            # handle them.
+            if isinstance(predicate, str):
+                if is_iri(predicate):
+                    ref_iri = predicate
+            else:
+                # Assume it's a dict. Look at the fields we know about.
+                for field in ['_id', 'type']:
+                    field_value = predicate.get(field, None)
+                    if isinstance(field_value, str) and is_iri(field_value) and ref_iri is None:
+                        # Take the first URL-looking thing we find
+                        ref_iri = field_value
+                        break
+
+            if isinstance(field_type, MutableSequence):
+                if field_type[0] == "null" and len(field_type) == 2:
+                    optional = True
+                    field_type = field_type[1]
+                else:
+                    raise Exception("Can't handle it")
+
+            if isinstance(field_type, MutableMapping):
+                if field_type["type"] == "array":
+                    # Now replace the field type with the actual type string
+                    is_list = True
+                    field_type = field_type.get('items', '')
+                else:
+                    field_type = field_type.get('type', '')
+                    pass
 
             if field_type in by_name:
                 # This is a subrecord. We need to recurse
@@ -226,10 +228,24 @@ def generate_form(schema, options):
     return list(walk_fields(root_name))
 
 
-# At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
-METADATA_SCHEMA = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
-METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
-FORM_ITEMS = generate_form(METADATA_SCHEMA, METADATA_OPTION_DEFINITIONS)
+import schema_salad.schema
+def load_schema_generate_form():
+    # At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
+    if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
+        METADATA_SCHEMA = yaml.round_trip_load(open("bh20sequploader/bh20seq-schema.yml","r").read())
+        METADATA_OPTION_DEFINITIONS = yaml.safe_load(open("bh20sequploader/bh20seq-options.yml","r").read())
+    else:
+        METADATA_SCHEMA = yaml.round_trip_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
+        METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
+
+    METADATA_SCHEMA["name"] = "bh20seq-schema.yml"
+    add_lc_filename(METADATA_SCHEMA, "bh20seq-schema.yml")
+    metaschema_names, _metaschema_doc, metaschema_loader = schema_salad.schema.get_metaschema()
+    schema_doc, schema_metadata = metaschema_loader.resolve_ref(METADATA_SCHEMA, "")
+
+    return generate_form(schema_doc, METADATA_OPTION_DEFINITIONS)
+
+FORM_ITEMS = load_schema_generate_form()
 
 @app.route('/')
 def send_home():
@@ -237,7 +253,7 @@ def send_home():
     Send the front page.
     """
 
-    return render_template('home.html', menu='HOME')
+    return render_template('home.html', menu='HOME', load_map=True)
 
 
 @app.route('/upload')
@@ -429,17 +445,21 @@ def receive_files():
 
         if result.returncode != 0:
             # It didn't work. Complain.
-            error_message="Uploader returned value {} and said:".format(result.returncode) + str(result.stderr.decode('utf-8'))
+            error_message="Uploader returned value {} and said:\n".format(result.returncode) + str(result.stderr.decode('utf-8'))
             print(error_message, file=sys.stderr)
             return (render_template('error.html', error_message=error_message), 403)
         else:
             # It worked. Say so.
-            return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace'))
+            return render_template('success.html', log=result.stderr.decode('utf-8', errors='replace'))
     finally:
         shutil.rmtree(dest_dir)
 
-def get_html_body(fn):
-    buf = ""
+
+def edit_button(url,text="Edit text!"):
+    return '<p class="editbutton"><a href="'+url+'">'+text+'<img src="static/image/edit.png"></a></p>'
+
+def get_html_body(fn,source="https://github.com/arvados/bh20-seq-resource/tree/master/doc"):
+    buf = edit_button(source)
     in_body = False
     begin_body = re.compile(r"<body>",re.IGNORECASE)
     end_body = re.compile(r"(</body>|.*=\"postamble\")",re.IGNORECASE)
@@ -451,11 +471,12 @@ def get_html_body(fn):
                 buf += line
             elif begin_body.match(line):
                 in_body = True
+    buf += edit_button(source)
     return buf
 
 @app.route('/download')
 def download_page():
-    buf = get_html_body('doc/web/download.html')
+    buf = get_html_body('doc/web/download.html','https://github.com/arvados/bh20-seq-resource/blob/master/doc/web/download.org')
     return render_template('resource.html',menu='DOWNLOAD',embed=buf)
 
 def pending_table(output, items):
@@ -468,10 +489,13 @@ def pending_table(output, items):
     for r in items:
         if r["status"] != "pending":
             continue
-        output.write("<tr>")
-        output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
-        output.write("<td>%s</td>" % Markup.escape(r["sequence_label"]))
-        output.write("</tr>")
+        try:
+            output.write("<tr>")
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+            output.write("<td>%s</td>" % Markup.escape(r.get("sequence_label")))
+            output.write("</tr>")
+        except:
+            pass
     output.write(
 """
 </table>
@@ -486,18 +510,69 @@ def rejected_table(output, items):
 <th>Errors</th></tr>
 """)
     for r in items:
-        if r["status"] != "rejected":
-            continue
+        try:
+            if r["status"] != "rejected":
+                continue
+            output.write("<tr>")
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+            output.write("<td>%s</td>" % Markup.escape(r.get("sequence_label")))
+            output.write("<td><pre>%s</pre></td>" % Markup.escape("\n".join(r.get("errors", []))))
+            output.write("</tr>")
+        except:
+            pass
+    output.write(
+"""
+</table>
+""")
+
+def workflows_table(output, items):
+    output.write(
+"""
+<table>
+<tr>
+<th>Name</th>
+<th>Sample id</th>
+<th>Started</th>
+<th>Container request</th>
+</tr>
+""")
+    for r in items:
         output.write("<tr>")
-        output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
-        output.write("<td>%s</td>" % Markup.escape(r["sequence_label"]))
-        output.write("<td><pre>%s</pre></td>" % Markup.escape("\n".join(r.get("errors", []))))
+        try:
+            sid = r["mounts"]["/var/lib/cwl/cwl.input.json"]["content"]["sample_id"]
+            output.write("<td>%s</td>" % Markup.escape(r["name"]))
+            output.write("<td>%s</td>" % Markup.escape(sid))
+            output.write("<td>%s</td>" % Markup.escape(r["created_at"]))
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/container_requests/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+        except:
+            pass
         output.write("</tr>")
     output.write(
 """
 </table>
 """)
 
+def validated_table(output, items):
+    output.write(
+"""
+<table>
+<tr>
+<th>Collection</th>
+<th>Sequence label</th>
+</tr>
+""")
+    for r in items:
+        try:
+            output.write("<tr>")
+            output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+            output.write("<td>%s</td>" % Markup.escape(r["properties"].get("sequence_label")))
+            output.write("</tr>")
+        except:
+            pass
+    output.write(
+"""
+</table>
+""")
 
 @app.route('/status')
 def status_page():
@@ -518,46 +593,57 @@ def status_page():
         prop["uuid"] = p["uuid"]
         status[prop["status"]] = status.get(prop["status"], 0) + 1
 
+    workflows = arvados.util.list_all(api.container_requests().list,
+                                      filters=[["name", "in", ["fastq2fasta.cwl"]], ["state", "=", "Committed"]],
+                                      order="created_at asc")
+
     output = io.StringIO()
 
     validated = api.collections().list(filters=[["owner_uuid", "=", VALIDATED_PROJECT]], limit=1).execute()
     status["passed"] = validated["items_available"]
 
-    for s in (("passed", "/download"), ("pending", "#pending"), ("rejected", "#rejected")):
+    for s in (("passed", "/validated"), ("pending", "#pending"), ("rejected", "#rejected")):
         output.write("<p><a href='%s'>%s sequences QC %s</a></p>" % (s[1], status.get(s[0], 0), s[0]))
 
-    output.write("<a id='pending'><h1>Pending</h1>")
+    output.write("<p><a href='%s'>%s analysis workflows running</a></p>" % ('#workflows', len(workflows)))
+
+    output.write("<a id='pending'><h1>Pending</h1></a>")
     pending_table(output, out)
 
-    output.write("<a id='rejected'><h1>Rejected</h1>")
+    output.write("<a id='rejected'><h1>Rejected</h1></a>")
     rejected_table(output, out)
 
+    output.write("<a id='workflows'><h1>Running Workflows</h1></a>")
+    workflows_table(output, workflows)
+
     return render_template('status.html', table=Markup(output.getvalue()), menu='STATUS')
 
+@app.route('/validated')
+def validated_page():
+    api = arvados.api(host=ARVADOS_API, token=ANONYMOUS_TOKEN, insecure=True)
+    output = io.StringIO()
+    validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", VALIDATED_PROJECT]])
+    validated_table(output, validated)
+    return render_template('validated.html', table=Markup(output.getvalue()), menu='STATUS')
+
 @app.route('/demo')
 def demo_page():
-    return render_template('demo.html',menu='DEMO')
+    return render_template('demo.html',menu='DEMO',load_map=True)
 
 @app.route('/blog',methods=['GET'])
 def blog_page():
     blog_content = request.args.get('id') # e.g. using-covid-19-pubseq-part3
     buf = None;
     if blog_content:
-        buf = get_html_body('doc/blog/'+blog_content+'.html')
+        buf = get_html_body('doc/blog/'+blog_content+'.html',"https://github.com/arvados/bh20-seq-resource/blob/master/doc/blog/"+blog_content+".org")
     return render_template('blog.html',menu='BLOG',embed=buf,blog=blog_content)
 
 
 @app.route('/about')
 def about_page():
-    buf = get_html_body('doc/web/about.html')
+    buf = get_html_body('doc/web/about.html','https://github.com/arvados/bh20-seq-resource/blob/master/doc/web/about.org')
     return render_template('about.html',menu='ABOUT',embed=buf)
 
-##
-@app.route('/map')
-def map_page():
-    return render_template('map.html',menu='DEMO')
-
-
 
 ## Dynamic API functions starting here
 ## This is quick and dirty for now, just to get something out and demonstrate the queries
diff --git a/bh20simplewebuploader/static/blog.css b/bh20simplewebuploader/static/blog.css
index 82a7c51..3ee8c44 100644
--- a/bh20simplewebuploader/static/blog.css
+++ b/bh20simplewebuploader/static/blog.css
@@ -4,12 +4,13 @@
 .timestamp { font-family: monospace; color: darkgreen; }
 
 h1,h2 { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace; color:black;background-color:#F0F8FF; }
-h2 { color: darkblue; }
-h3,h4 { color: black; }
+h2 { color: black; }
+h3,h4 { color: black; margin:0; }
 code { color: darkblue; }
-body {font-family: Palatino, 'Palatino Linotype', serif; color:black;background-color:#e5d8f0; font-size: large }
+body {font-family: Palatino, 'Palatino Linotype', serif; color:black; background-color:white; font-size: large }
 
-div.verbatim { margin: 30px; color: black; background-color: white; border-style:outset; font-family: palatino font, monospace; font-size:80%;  font-weight:bold; }
+div.verbatim { margin: 30px; color: black; background-color: white; border-style:outset;
+               font-family: palatino font, monospace; font-size:80%;  font-weight:bold; }
 div.quote { font-family: palatino font, monospace; font-size:80%; }
 div.quotation { font-family: palatino font, monospace; font-size:80%; }
 pre.example { margin: 30px; font-family: prestige, monospace; color:black; font-size:70%; background-color: lightyellow;  }
@@ -23,10 +24,10 @@ div[id="text-table-of-contents"]{
 }
 div[class^="outline-text"] {
     margin: 10px;
-    background-color:#ebe6f0;
-    border-style: dotted;
-    border-color: #98bf21;
-    border-width: 1px;
+    // background-color:white;
+    // border-style: dotted;
+    // border-color: #98bf21;
+    // border-width: 1px;
     font-family: Palatino, 'Palatino Linotype', serif; color:black; font-size: large
 }
 span[class="todo TESTING"] {
diff --git a/bh20simplewebuploader/static/image/CWL.png b/bh20simplewebuploader/static/image/CWL.png
new file mode 100644
index 0000000..81d1807
--- /dev/null
+++ b/bh20simplewebuploader/static/image/CWL.png
Binary files differdiff --git a/bh20simplewebuploader/static/image/ESR.png b/bh20simplewebuploader/static/image/ESR.png
new file mode 100644
index 0000000..557c798
--- /dev/null
+++ b/bh20simplewebuploader/static/image/ESR.png
Binary files differdiff --git a/bh20simplewebuploader/static/image/curii.logo.ai.png b/bh20simplewebuploader/static/image/curii.logo.ai.png
new file mode 100644
index 0000000..401afad
--- /dev/null
+++ b/bh20simplewebuploader/static/image/curii.logo.ai.png
Binary files differdiff --git a/bh20simplewebuploader/static/image/curii.logo.ai.svg b/bh20simplewebuploader/static/image/curii.logo.ai.svg
new file mode 100644
index 0000000..e87ea05
--- /dev/null
+++ b/bh20simplewebuploader/static/image/curii.logo.ai.svg
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg width="1333.3" height="1333.3" version="1.1" viewBox="0 0 1333.3 1333.3" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" xmlns:cc="http://creativecommons.org/ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><metadata><rdf:RDF><cc:Work rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/><dc:title/></cc:Work></rdf:RDF></metadata><defs><clipPath id="clipPath18"><path d="m0 1e3h1e3v-1e3h-1e3z"/></clipPath></defs><g transform="matrix(1.3333 0 0 -1.3333 0 1333.3)"><g clip-path="url(#clipPath18)"><g transform="translate(473.7 553.52)"><path d="m0 0c-23.91 0-41.063-17.932-41.063-44.182 0-26.251 17.153-44.179 41.063-44.179 17.414 0 31.448 5.714 40.805 16.63l16.63-16.63c-12.476-16.377-32.488-25.735-58.734-25.735-41.063 0-69.653 28.329-69.653 69.914 0 41.843 28.59 70.428 69.653 70.428 24.95 0 44.439-8.053 57.174-22.611l-16.37-19.229c-9.619 9.876-22.875 15.851-39.505 15.594" fill="#008a82"/></g><g transform="translate(681.34 578.73)"><path d="m0 0v-138.26h-29.889v27.029c-9.355-19.232-26.765-27.806-49.377-28.07-31.707 0-50.676 20.276-50.676 53.281v86.022h29.884v-77.707c0-20.792 12.216-33.526 32.228-33.526 23.911 0.523 37.941 18.972 37.941 43.143v68.09z" fill="#008a82"/></g><g transform="translate(796.73 579.77)"><path d="m0 0v-28.585c-27.809 1.556-45.481-14.815-47.818-38.468v-72.248h-29.887v138.26h29.887v-27.546c9.356 18.71 25.99 28.585 47.818 28.585" fill="#008a82"/></g><g transform="translate(847.15 618.24)"><path d="m0 0c0-10.136-7.54-17.673-17.412-17.673-9.877 0-17.413 7.537-17.413 17.673 0 10.396 7.536 17.933 17.413 17.933 9.872 0 17.412-7.537 17.412-17.933" fill="#f15a29"/></g><path d="m844.54 440.47h-29.625v138.26h29.625z" fill="#008a82"/><g transform="translate(914.46 618.24)"><path d="m0 0c0-10.136-7.541-17.673-17.413-17.673-9.876 0-17.413 7.537-17.413 17.673 0 10.396 7.537 17.933 17.413 17.933 9.872 0 17.413-7.537 17.413-17.933" fill="#f15a29"/></g><path d="m911.85 440.47h-29.625v138.26h29.625z" fill="#008a82"/><g transform="translate(131.67 492.85)"><path d="m0 0c0.628 4.633 3.756 8.256 7.824 9.843l-9.454 65.551c-0.213 0.021-0.426-0.01-0.638 0.021-2.008 0.27-3.807 1.046-5.367 2.126l-57.718-52.611c2.31-21.875 9.31-42.198 19.929-60.052l46.065 29.321c-0.641 1.805-0.924 3.766-0.641 5.801" fill="#f15a29"/></g><g transform="translate(135.54 569.04)"><path d="m0 0 9.451-65.545c0.223-0.018 0.432 0.01 0.648-0.02 0.975-0.131 1.887-0.395 2.767-0.737l38.262 56.827-44.955 14.619c-1.414-2.373-3.608-4.149-6.173-5.144" fill="#f15a29"/></g><g transform="translate(280.25 609.22)"><path d="m0 0-48.729-0.574c-1.211-5.586-6.039-9.562-11.655-9.744l-24.468-36.342 53.617-17.44c0 0.809-0.074 1.62-0.051 2.43 0.912 25.152 12.634 47.22 31.286 61.67" fill="#f15a29"/></g><g transform="translate(217.14 623.42)"><path d="m0 0 2.255 58.382c-4.958 0.183-9.984 0.112-15.047-0.205-15.084-0.972-29.49-4.186-42.934-9.259l51.399-50.633c1.31 0.834 2.768 1.411 4.327 1.715" fill="#f15a29"/></g><g transform="translate(132.75 592.86)"><path d="m0 0c6.582-0.901 11.22-6.828 10.639-13.386l46.505-15.128 24.123 35.815c-4.675 2.295-7.615 7.341-6.872 12.785 0.239 1.772 0.867 3.403 1.748 4.824l-53.093 52.29c-12.31-5.236-23.718-12.06-34.013-20.225l9.958-56.937c0.33-0.028 0.661 6e-3 1.005-0.038" fill="#f15a29"/></g><g transform="translate(135.28 482.35)"><path d="m0 0-46.12-29.351c14.73-22.781 35.474-41.144 59.841-53.023l-6.781 78.911c-2.738 0.383-5.104 1.664-6.94 3.463" fill="#008a82"/></g><g transform="translate(118.77 582.23)"><path d="m0 0c0.611 4.509 3.588 8.073 7.483 9.721l-9.4 53.816c-33.716-29.174-53.94-73.126-51.126-121.01l54.559 49.732c-1.283 2.272-1.897 4.952-1.516 7.74" fill="#f15a29"/></g><g transform="translate(361.67 546.89)"><path d="m0 0c-7.048-4.637-14.905-8.13-23.401-10.048-23.806-5.372-49.043-2.852-73.21-5.508 3.301-19.489 10.484-39.441 11.517-59.119 1.522-29.071-4.033-57.766-26.257-78.143-2.545-2.344-5.248-4.393-8.036-6.28 72.262 13.991 124.59 79.812 119.73 155.21-0.078 1.306-0.227 2.592-0.345 3.888" fill="#008a82"/></g><g transform="translate(250.67 529.01)"><path d="m0 0c-8.863-1.971-17.477-5.104-25.76-9.947 7.733-9.046 15.469-18.091 23.188-27.143 4.901-5.729-4.405-12.76-9.275-7.059-7.844 9.184-15.692 18.355-23.533 27.543-3.244-2.674-6.116-5.496-8.563-8.494 13.566-16.281 27.111-32.576 40.666-48.876 4.826-5.793-4.486-12.83-9.286-7.061-12.67 15.225-25.348 30.468-38.019 45.701-2.14-4.209-3.767-8.604-4.84-13.168 8.495-9.946 16.998-19.892 25.493-29.843 4.898-5.729-4.411-12.758-9.281-7.061-6.019 7.032-12.026 14.071-18.048 21.114 0 0-4.766-16.237-0.865-52.291-1.66 0.746-3.085 2.109-3.847 4.189-9.88 27.309-16.654 62.017 0.85 88.074 12.678 18.854 32.288 30.459 53.735 36.456l-51.115 16.627-39.174-58.167c2.461-2.63 3.791-6.261 3.274-10.104-0.661-4.881-4.104-8.68-8.509-10.1l7.041-82.087c11.054-4.806 22.783-8.293 34.971-10.297 31.35 2.055 62.177 14.151 70.26 48.559 4.169 17.775 3.976 36.163 0.456 54.069-2.558 13.035-7.277 26.098-9.819 39.366" fill="#008a82"/></g><g transform="translate(312.28 562.6)"><path d="m0 0c-9.515 0-17.227 7.709-17.227 17.228 0 9.514 7.712 17.227 17.227 17.227 9.519 0 17.228-7.713 17.228-17.227 0-9.519-7.709-17.228-17.228-17.228m-87.342 118.87-2.248-58.267c4.425-1.198 7.753-4.685 8.799-9.009l56.965 0.679c7.689 4.574 16.285 7.993 25.578 9.991 2.66 0.576 4.931-0.281 6.548-1.786l-2.677 1.573c-28.575-13.342-42.238-34.715-48.462-48.807-2.258-4.6-3.79-9.265-4.739-13.983-0.01-0.048-0.03-0.179-0.03-0.179-1.08-5.452-1.428-10.967-1.175-16.536 30.725 3.518 70.847-3.875 96.101 16.526-12.633 65.686-68.086 114.92-134.66 119.8" fill="#f15a29"/></g></g></g></svg>
\ No newline at end of file
diff --git a/bh20simplewebuploader/static/image/edit.png b/bh20simplewebuploader/static/image/edit.png
new file mode 100644
index 0000000..571b08c
--- /dev/null
+++ b/bh20simplewebuploader/static/image/edit.png
Binary files differdiff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
index b9b27f4..6e651a4 100644
--- a/bh20simplewebuploader/static/main.css
+++ b/bh20simplewebuploader/static/main.css
@@ -47,7 +47,7 @@ h2 > svg {
     float: right;
 }
 
-#map {
+#mapid {
     width: 800px;
     height: 440px;
     border: 1px solid #AAA;
@@ -178,7 +178,7 @@ span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; }
 
 .about {
     display: grid;
-    grid-template-columns: repeat(2, 1fr);
+    grid-template-columns: 1fr 1fr;
     grid-auto-flow: row;
 }
 
@@ -229,7 +229,7 @@ a {
 #metadata_fill_form {
     column-count: 4;
     margin-top: 0.5em;
-    column-width: 250px;
+    column-width: 15em;
 }
 
 .record, .record .field-group, .record .field-group .field {
@@ -238,6 +238,8 @@ a {
     -webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */
     page-break-inside: avoid; /* Firefox */
     break-inside: avoid;
+    display: block;
+    width: 90%;
 }
 
 .record {
@@ -258,6 +260,10 @@ a {
     width: max-content;
 }
 
+.control {
+    width: 100%;
+}
+
 .filter-options {
     width: 100%;
 }
@@ -304,9 +310,10 @@ footer {
 }
 
 .sponsors img {
-    width: 80%;
-    display:block;
-    margin:auto;
+    width: auto;
+    display: block;
+    margin: auto;
+    height: 4em;
 }
 
 .loader {
@@ -377,3 +384,22 @@ div.status {
     vertical-align: top;
     border-bottom: 1px solid #ddd;
 }
+
+.map {
+    padding: 20px 32px;
+    // display: inline-block;
+}
+
+.editbutton {
+    float: right;
+    text-align: right;
+    background-color: lightgrey;
+    border: 2px solid #4CAF50;
+    border-radius: 12px;
+    color: black;
+    padding: 5px 32px;
+    // text-decoration: none;
+    display: inline-block;
+    font-size: 16px;
+    box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2), 0 6px 20px 0 rgba(0,0,0,0.19);
+}
diff --git a/bh20simplewebuploader/static/main.js b/bh20simplewebuploader/static/main.js
index 751e478..1633c25 100644
--- a/bh20simplewebuploader/static/main.js
+++ b/bh20simplewebuploader/static/main.js
@@ -13,70 +13,41 @@ function myFunction() {
     }
 }
 
-let map = L.map( 'map', {
-  center: [37.0902, -95.7129],  // Default to U.S.A
-  minZoom: 3,
-  zoom: 0
-});
-L.tileLayer( 'http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
-  attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a>',
-  subdomains: ['a','b','c']
-}).addTo( map );
-
-let markers = L.markerClusterGroup().addTo(map)
-
-
 function fetchAPI(apiEndPoint) {
-  fetch(scriptRoot + apiEndPoint)
-    .then(response => {
-      return response.json();
-    })
-    .then(data => {
-        console.log(data);
-      markers.clearLayers();
-      document.getElementById("results").classList.remove("invisible");
-      document.getElementById("loader").classList.add("invisible");
-      if (!(apiEndPoint === "/api/getAllaccessions")) {
-        for (let i = 0; i < data.length; i++) {
-          let {"count": fastaCount, GPS, LocationLabel: label } = data[i];
-          let coordinates = GPS.split(" ");
-          if (!(coordinates == null)) {
-            let lat, lon;
-            [lon, lat] = coordinates.map(parseFloat);
-            let point = L.point()
-            let marker = L.marker([lat, lon]);
-            marker.bindPopup("<b>" + label + "</b><br/>" + "FastaCount: " +fastaCount);
-            markers.addLayer(marker)
-          }}
-      }
-      // Reload the map
-      map.invalidateSize();
-    });
-  document.getElementById("results").classList.add("invisible");
-  document.getElementById("loader").classList.remove("invisible");
-
-}
-
-// Copy from function above but now added as table instead of plain json
-function fetchAPIV2(apiEndPoint) {
-  fetch(scriptRoot + apiEndPoint)
-    .then(response => {
-      return response.json();
-    })
-    .then(data => {
-      console.log(data)
-       htmlString="<table>"
-
-       // Depending on what we want to explore we'd have to call a different function ....? But how to Include that?
-       for (var i=0; i<data.length;i++) {
-            htmlString=htmlString+"<tr><td><a href='#' onclick='fetchSEQByLocation(\""+data[i]["key"]+"\");'>"+data[i]["label"]+"</a></td><td>"+data[i]["count"]+"<td></tr>"
-       }
-       htmlString=htmlString+"</table>"
-
-      document.getElementById("table").innerHTML = htmlString
-    });
-
-  document.getElementById("results").classList.add("invisible");
+    fetch(scriptRoot + apiEndPoint)
+        .then(response => {
+            return response.json();
+        })
+        .then(data => {
+            console.log(data);
+        });
+    document.getElementById("map_view").classList.add("invisible");
+    document.getElementById("loader").classList.remove("invisible");
+}
+
+// Copy from function above but now output HTML table instead of plain json
+function fetchHTMLTable(apiEndPoint) {
+    fetch(scriptRoot + apiEndPoint)
+        .then(response => {
+            return response.json();
+        })
+        .then(data => {
+            console.log(data)
+            htmlString="<table>"
+
+            // Depending on what we want to explore we'd have to call a different function ....? But how to Include that?
+            /*
+            for (var i=0; i<data.length;i++) {
+                htmlString=htmlString+"<tr><td><a href='#' onclick='fetchSEQByLocation(\""+data[i]["key"]+"\");'>"+data[i]["label"]+"</a></td><td>"+data[i]["count"]+"<td></tr>"
+            }
+*/
+            for (var i=0; i<data.length;i++) {
+                htmlString=htmlString+"<tr><td>"+data[i]["label"]+"</td><td>"+data[i]["count"]+"<td></tr>"
+            }
+            htmlString=htmlString+"</table>"
+
+            document.getElementById("table").innerHTML = htmlString
+        });
 }
 
 
@@ -85,36 +56,39 @@ let search = () => {
   fetchAPI(scriptRoot + "/api/getDetailsForSeq?seq=" + encodeURIComponent(m));
 }
 
+// Get count from Arvados
 let fetchCount = () => {
   fetchAPI("/api/getCount");
 }
 
+// Get count from Virtuoso
 let fetchCountDB = () => {
   fetchAPI("/api/getCountDB");
 }
 
 let fetchSEQCountBySpecimen = () => {
-  fetchAPIV2("/api/getSEQCountbySpecimenSource");
+  fetchHTMLTable("/api/getSEQCountbySpecimenSource");
 }
 
 let fetchSEQCountByLocation = () => {
-  fetchAPIV2("/api/getSEQCountbyLocation");
+  fetchHTMLTable("/api/getSEQCountbyLocation");
 }
 
 let fetchSEQCountByTech = () => {
-  fetchAPIV2("/api/getSEQCountbytech");
+  fetchHTMLTable("/api/getSEQCountbytech");
 }
 
 let fetchAllaccessions = () => {
-  fetchAPI("/api/getAllaccessions");
+  fetchHTMLTable("/api/getAllaccessions");
 };
 
-let fetchCountByGPS = () => {
-  fetchAPI("/api/getCountByGPS");
+let fetchMap = () => {
+    fetchAPI("/api/getCountByGPS");
+    updateMapMarkers();
 };
 
 let fetchSEQCountbyLocation = () => {
-  fetchAPIV2("/api/getSEQCountbyLocation");
+  fetchHTMLTable("/api/getSEQCountbyLocation");
 };
 
 let fetchSEQByLocation = () => {
@@ -122,7 +96,7 @@ let fetchSEQByLocation = () => {
 };
 
 let fetchSEQCountbyContinent = () => {
-  fetchAPIV2("/api/getSEQCountbyContinent");
+  fetchHTMLTable("/api/getSEQCountbyContinent");
 }
 
 
@@ -195,7 +169,7 @@ function addField(e) {
   // Increment the number and use the keypath and number to set IDs and cross
   // references.
   // TODO: Heavily dependent on the form field HTML. Maybe we want custom
-  // elements for the labeled controlsd that know how to be list items?
+  // elements for the labeled controls that know how to be list items?
   fieldNumber++
   newField.dataset.number = fieldNumber
   let newID = keypath + '[' + fieldNumber + ']'
@@ -252,36 +226,3 @@ function on_submit_button() {
         return false;
     }
 }
-
-
-
-//
-
-function drawMap(){
-
-// initialize the map on the "map" div with a given center and zoom
-var mymap = L.map('mapid').setView([51.505, -0.09], 1);
-
-L.tileLayer('https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
-    attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors'
-}).addTo(mymap);
-
-fetch(scriptRoot + "api/getCountByGPS")
-    .then(response => {
-    console.log(response)
-      return response.json();
-    })
-    .then(data => {
-
-   for (var i=0; i<data.length;i++) {
-   gps=data[i]["GPS"].split(" ")
-    var circle = L.circle([gps[1], gps[0]], {
-    color: 'red',
-    fillColor: '#f03',
-    fillOpacity: 0.5,
-    radius: parseInt(data[i]["count"])  //not working for whatever reason
-        }).addTo(mymap);
-      }
-
-      });
-}
diff --git a/bh20simplewebuploader/static/map.js b/bh20simplewebuploader/static/map.js
new file mode 100644
index 0000000..1003f7d
--- /dev/null
+++ b/bh20simplewebuploader/static/map.js
@@ -0,0 +1,50 @@
+
+var map = L.map( 'mapid', {
+    center: [51.505, -0.09],  // Default to U.S.A
+    minZoom: 2,
+    zoom: 0
+});
+
+L.tileLayer( 'http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
+    attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> | <a href="http://covid19.genenetwork.org/">COVID-19 PubSeq</a>',
+    subdomains: ['a','b','c']
+}).addTo(map);
+
+
+function drawMap(){
+    var mymap = map;
+
+    fetch(scriptRoot + "api/getCountByGPS")
+        .then(response => {
+            console.log(response)
+            return response.json();
+        })
+        .then(data => {
+            updateMapMarkers(data);
+
+      });
+    document.getElementById("map_view").classList.remove("invisible");
+    map.invalidateSize();
+}
+
+
+
+/* This function updates the map with markers
+ *
+*/
+function updateMapMarkers(data) {
+    let markers = L.markerClusterGroup();
+    for (let i = 0; i < data.length; i++) {
+        let {"count": fastaCount, GPS, LocationLabel: label } = data[i];
+        let coordinates = GPS.split(" ");
+        if (!(coordinates == null)) {
+            let lat, lon;
+            [lon, lat] = coordinates.map(parseFloat);
+            let point = L.point()
+            marker = (L.marker([lat, lon]));
+            marker.bindPopup("<b>" + label + "</b><br/>" + "SARS-CoV-2<br/>sequences: " +fastaCount);
+            markers.addLayer(marker);
+        }
+    }
+    map.addLayer(markers);
+}
diff --git a/bh20simplewebuploader/templates/blog.html b/bh20simplewebuploader/templates/blog.html
index dbc0b99..823f8a1 100644
--- a/bh20simplewebuploader/templates/blog.html
+++ b/bh20simplewebuploader/templates/blog.html
@@ -9,9 +9,7 @@
     {{ embed|safe }}
     <hr>
 
-    <p>
-      Other blog entries:
-    </p>
+    <h1>Other blog entries</h1>
 
     {% else %}
     {% include 'blurb.html' %}
diff --git a/bh20simplewebuploader/templates/blurb.html b/bh20simplewebuploader/templates/blurb.html
index 80fd384..9eef7c2 100644
--- a/bh20simplewebuploader/templates/blurb.html
+++ b/bh20simplewebuploader/templates/blurb.html
@@ -2,7 +2,7 @@
   This is the COVID-19 Public Sequence Resource (COVID-19 PubSeq) for
   SARS-CoV-2 virus sequences. COVID-19 PubSeq is a repository for
   sequences with a low barrier to entry for uploading sequence data
-  using best practices. I.e., data published with a creative commons
+  using best practices, including <a href="https://en.wikipedia.org/wiki/FAIR_data">FAIR data</a>. I.e., data published with a creative commons
   CC0 or CC-4.0 license with metadata using state-of-the art standards
   and, perhaps most importantly, providing standardised workflows that
   get triggered on upload, so that results are immediately available
diff --git a/bh20simplewebuploader/templates/demo-run.html b/bh20simplewebuploader/templates/demo-run.html
deleted file mode 100644
index a8f9edc..0000000
--- a/bh20simplewebuploader/templates/demo-run.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<section class="search-section">
-  <div class="filter-options" action="#">
-    <p>[Demo] Display content sequences by: </p>
-    <div>
-      <button class="button" onclick="fetchSEQCountBySpecimen()">Count by Specimen source</button>
-      <button class="button" onclick="fetchSEQCountByLocation()">Count by Location</button>
-      <button class="button" onclick="fetchSEQCountByTech()">Count by Sequencer</button>
-      <button class="button" onclick="fetchAllaccessions()">Show All accessions</button>
-      <button class="button" onclick="fetchSEQCountbyContinent()">Count by Continent</button>
-      <button class="button" onclick="fetchCountByGPS()">Map</button>
-
-    </div>
-
-  </div>
-
-</section>
-<div id="loader" class="loader invisible">
-</div>
-
-<section id="results" class="invisible">
-    <div id="map"></div>
-</section>
-
- <section>
-    <div id="table"></div>
- </section>
diff --git a/bh20simplewebuploader/templates/demo.html b/bh20simplewebuploader/templates/demo.html
index 44aded0..75bc0e2 100644
--- a/bh20simplewebuploader/templates/demo.html
+++ b/bh20simplewebuploader/templates/demo.html
@@ -1,13 +1,51 @@
 <!DOCTYPE html>
 <html>
   {% include 'header.html' %}
+  {% include 'mapheader.html' %}
   <body>
     {% include 'banner.html' %}
     {% include 'menu.html' %}
-    {% include 'search.html' %}
-      <p>The Virtuoso database contains <span id="CounterDB"></span> public sequences!</p>
-    {% include 'demo-run.html' %}
-    {% include 'footer.html' %}
+
+    <p>The Virtuoso database contains <span id="CounterDB"></span> public sequences!</p>
+
+    <!--
+    <div class="search">
+      <input id="search-input" type="search" placeholder="FASTA uri" required>
+      <button class="button search-button" type="submit" onclick="search()">
+        <span class="icon ion-search">
+          <span class="sr-only">Search</span>
+        </span>
+      </button>
+      <span class="dropt" title="http://collections.lugli.arvadosapi.com/c=00fede2c6f52b053a14edca01cfa02b7+126/sequence.fasta">(example)<span style="width:500px;"></span></span>
+    </div>
+    -->
+
+    <section class="search-section">
+      <div class="filter-options" action="#">
+        <p>[Demo] Display content sequences by: </p>
+        <div>
+          <button class="button" onclick="fetchSEQCountBySpecimen()">Count by Specimen source</button>
+          <button class="button" onclick="fetchSEQCountByLocation()">Count by Location</button>
+          <button class="button" onclick="fetchSEQCountByTech()">Count by Sequencer</button>
+          <!-- <button class="button" onclick="fetchAllaccessions()">Show All accessions</button> -->
+          <button class="button" onclick="fetchSEQCountbyContinent()">Count by Continent</button>
+        </div>
+
+      </div>
+
+    </section>
+    <div id="loader" class="loader invisible">
+    </div>
+
+    <section id="map_view" class="map">
+      <div id="mapid"></div>
+    </section>
+
+    <section>
+      <div id="table"></div>
+    </section>
+
+ {% include 'footer.html' %}
 
     <script type="text/javascript">
       let scriptRoot = {{ request.script_root|tojson|safe }}; // examples
@@ -24,7 +62,10 @@
               });
       });
 
+     drawMap()
+
     </script>
+
   </body>
 
 </html>
diff --git a/bh20simplewebuploader/templates/error.html b/bh20simplewebuploader/templates/error.html
index b1d9402..fc08aed 100644
--- a/bh20simplewebuploader/templates/error.html
+++ b/bh20simplewebuploader/templates/error.html
@@ -15,7 +15,7 @@
           </pre>
         </p>
         <p>
-            <a href="/">Click here to try again.</a>
+            <a href="/upload">Click here to try again.</a>
         </p>
         <hr>
     </body>
diff --git a/bh20simplewebuploader/templates/footer.html b/bh20simplewebuploader/templates/footer.html
index a1dd4fd..f84cef5 100644
--- a/bh20simplewebuploader/templates/footer.html
+++ b/bh20simplewebuploader/templates/footer.html
@@ -21,14 +21,21 @@
           <img src="static/image/covid19biohackathon.png"></a>
       </div>
       <div class="sponsorimg">
-        <a href="https://www.commonwl.org/"><img src="static/image/CWL-Logo-Header.png"></a>
+        <a href="https://www.curii.com/"><img src="static/image/curii.logo.ai.png"></a>
       </div>
       <div class="sponsorimg">
         <a href="https://arvados.org/"><img src="static/image/arvados-logo.png"></a>
       </div>
       <div class="sponsorimg">
+        <a href="https://www.commonwl.org/"><img src="static/image/CWL.png"></a>
+      </div>
+      <div class="sponsorimg">
         <a href="https://uthsc.edu/"><img src="static/image/UTHSC-primary-stacked-logo-4c.png"></a>
       </div>
+      <div class="sponsorimg">
+        <a href="https://www.esr.cri.nz/"><img src="static/image/ESR.png"></a>
+      </div>
+
     </div>
   </div>
   <div class="footer">
@@ -40,6 +47,9 @@
     </center>
   </div>
 </section>
+{% if load_map %}
+<script type="text/javascript" src="/static/map.js"></script>
+{% endif %}
 <script type="text/javascript" src="/static/main.js"></script>
 
 <script type="text/javascript">
diff --git a/bh20simplewebuploader/templates/header.html b/bh20simplewebuploader/templates/header.html
index 0ac5157..1d66590 100644
--- a/bh20simplewebuploader/templates/header.html
+++ b/bh20simplewebuploader/templates/header.html
@@ -6,22 +6,4 @@
         {% if blog %}
         <link rel="Blog stylesheet" type="text/css" href="/static/blog.css" />
         {% endif %}
-        <link rel="stylesheet" href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css"
-              integrity="sha512-xwE/Az9zrjBIphAcBb3F6JVqxf46+CDLwfLMHloNu6KEQCAWi6HcDUbeOfBIptF7tcCzusKFjFw2yuvEpDL9wQ=="
-              crossorigin=""/>
-        <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.css"
-              integrity="sha512-RLEjtaFGdC4iQMJDbMzim/dOvAu+8Qp9sw7QE4wIMYcg2goVoivzwgSZq9CsIxp4xKAZPKh5J2f2lOko2Ze6FQ=="
-              crossorigin=""/>
-
-        <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.Default.css"
-              integrity="sha512-BBToHPBStgMiw0lD4AtkRIZmdndhB6aQbXpX7omcrXeG2PauGBl2lzq2xUZTxaLxYz5IDHlmneCZ1IJ+P3kYtQ=="
-              crossorigin=""/>
-
-        <script src="https://unpkg.com/leaflet@1.6.0/dist/leaflet.js"
-                integrity="sha512-gZwIG9x3wUXg2hdXF6+rVkLF/0Vi9U8D2Ntg4Ga5I5BZpVkVxlJWbSQtXPSiUTtC0TjtGOmxa1AJPuV0CPthew=="
-                crossorigin=""></script>
-
-        <script src="https://unpkg.com/leaflet.markercluster@1.4.1/dist/leaflet.markercluster.js"
-                integrity="sha512-MQlyPV+ol2lp4KodaU/Xmrn+txc1TP15pOBF/2Sfre7MRsA/pB4Vy58bEqe9u7a7DczMLtU5wT8n7OblJepKbg=="
-                crossorigin=""></script>
     </head>
diff --git a/bh20simplewebuploader/templates/home.html b/bh20simplewebuploader/templates/home.html
index b90a18d..bede611 100644
--- a/bh20simplewebuploader/templates/home.html
+++ b/bh20simplewebuploader/templates/home.html
@@ -1,6 +1,7 @@
 <!DOCTYPE html>
 <html>
   {% include 'header.html' %}
+  {% include 'mapheader.html' %}
     <body>
       {% include 'banner.html' %}
       {% include 'menu.html' %}
@@ -44,7 +45,19 @@
                 </div>
         </section>
 
-{% include 'footer.html' %}
+      <section id="map_view" class="map">
+        <div id="mapid"></div>
+      </section>
+
+      {% include 'footer.html' %}
+
+
+      <script type="text/javascript">
+        let scriptRoot = {{ request.script_root|tojson|safe }}; // examples
+
+        drawMap()
+
+      </script>
 
    </body>
 </html>
diff --git a/bh20simplewebuploader/templates/map.html b/bh20simplewebuploader/templates/map.html
deleted file mode 100644
index 595af0c..0000000
--- a/bh20simplewebuploader/templates/map.html
+++ /dev/null
@@ -1,33 +0,0 @@
-<!DOCTYPE html>
-<html>
-  {% include 'header.html' %}
-<link rel="stylesheet" href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css"
-   integrity="sha512-xwE/Az9zrjBIphAcBb3F6JVqxf46+CDLwfLMHloNu6KEQCAWi6HcDUbeOfBIptF7tcCzusKFjFw2yuvEpDL9wQ=="
-   crossorigin=""/>
-
-    {% include 'banner.html' %}
-    {% include 'menu.html' %}
-    <div id="mapid" style="height: 500px;"></div>
-
-    {% include 'footer.html' %}
-
-
-
-
-       <script type="text/javascript">
-        let scriptRoot = {{ request.script_root|tojson|safe }}; // examples
-      </script>
-
-<!-- Make sure you put this AFTER Leaflet's CSS -->
- <script src="https://unpkg.com/leaflet@1.6.0/dist/leaflet.js"
-   integrity="sha512-gZwIG9x3wUXg2hdXF6+rVkLF/0Vi9U8D2Ntg4Ga5I5BZpVkVxlJWbSQtXPSiUTtC0TjtGOmxa1AJPuV0CPthew=="
-   crossorigin=""></script>
-
-  <script>
-     //drawMap
-     drawMap()
-  </script>
-
-  </body>
-
-</html>
diff --git a/bh20simplewebuploader/templates/mapheader.html b/bh20simplewebuploader/templates/mapheader.html
new file mode 100644
index 0000000..ca62051
--- /dev/null
+++ b/bh20simplewebuploader/templates/mapheader.html
@@ -0,0 +1,16 @@
+  <link rel="stylesheet" href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css"
+        integrity="sha512-xwE/Az9zrjBIphAcBb3F6JVqxf46+CDLwfLMHloNu6KEQCAWi6HcDUbeOfBIptF7tcCzusKFjFw2yuvEpDL9wQ=="
+        crossorigin=""/>
+  <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.css"
+        integrity="sha512-RLEjtaFGdC4iQMJDbMzim/dOvAu+8Qp9sw7QE4wIMYcg2goVoivzwgSZq9CsIxp4xKAZPKh5J2f2lOko2Ze6FQ=="
+        crossorigin=""/>
+  <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.Default.css"
+        integrity="sha512-BBToHPBStgMiw0lD4AtkRIZmdndhB6aQbXpX7omcrXeG2PauGBl2lzq2xUZTxaLxYz5IDHlmneCZ1IJ+P3kYtQ=="
+        crossorigin=""/>
+
+  <script src="https://unpkg.com/leaflet@1.6.0/dist/leaflet.js"
+          integrity="sha512-gZwIG9x3wUXg2hdXF6+rVkLF/0Vi9U8D2Ntg4Ga5I5BZpVkVxlJWbSQtXPSiUTtC0TjtGOmxa1AJPuV0CPthew=="
+          crossorigin=""></script>
+  <script src="https://unpkg.com/leaflet.markercluster@1.4.1/dist/leaflet.markercluster.js"
+          integrity="sha512-MQlyPV+ol2lp4KodaU/Xmrn+txc1TP15pOBF/2Sfre7MRsA/pB4Vy58bEqe9u7a7DczMLtU5wT8n7OblJepKbg=="
+          crossorigin=""></script>
diff --git a/bh20simplewebuploader/templates/search.html b/bh20simplewebuploader/templates/search.html
index dbdca90..e69de29 100644
--- a/bh20simplewebuploader/templates/search.html
+++ b/bh20simplewebuploader/templates/search.html
@@ -1,10 +0,0 @@
-<div class="search">
-  <input id="search-input" type="search" placeholder="FASTA uri" required>
-  <button class="button search-button" type="submit" onclick="search()">
-    <span class="icon ion-search">
-      <span class="sr-only">Search</span>
-    </span>
-  </button>
-  <span class="dropt" title="http://collections.lugli.arvadosapi.com/c=00fede2c6f52b053a14edca01cfa02b7+126/sequence.fasta">(example)<span style="width:500px;"></span></span>
-</div>
-
diff --git a/bh20simplewebuploader/templates/status.html b/bh20simplewebuploader/templates/status.html
index a1cf28f..e89437e 100644
--- a/bh20simplewebuploader/templates/status.html
+++ b/bh20simplewebuploader/templates/status.html
@@ -7,7 +7,8 @@
 
       <h1>Sequence upload processing status</h1>
 
-        <div class="status">
+      <div class="status">
+
 	  {{ table }}
         </div>
 
diff --git a/bh20simplewebuploader/templates/success.html b/bh20simplewebuploader/templates/success.html
index 9f0987c..c2302fa 100644
--- a/bh20simplewebuploader/templates/success.html
+++ b/bh20simplewebuploader/templates/success.html
@@ -9,7 +9,7 @@
         <h1>Upload Successful</h1>
         <hr>
         <p>
-            Your files have been uploaded. They should soon appear as output of the <a href="/download">Public SARS-CoV-2 Sequence Resource</a>.
+            Your files have been uploaded. You can track their <a href="/status">QC status</a>, once validated they will be part of the <a href="/download">Public SARS-CoV-2 Sequence Resource</a>.
         </p>
         <p>
             The upload log was:
diff --git a/bh20simplewebuploader/templates/validated.html b/bh20simplewebuploader/templates/validated.html
new file mode 100644
index 0000000..cee94bd
--- /dev/null
+++ b/bh20simplewebuploader/templates/validated.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+  {% include 'header.html' %}
+    <body>
+      {% include 'banner.html' %}
+      {% include 'menu.html' %}
+
+      <h1>Validated sequences</h1>
+
+        <div class="status">
+	  {{ table }}
+        </div>
+
+{% include 'footer.html' %}
+
+   </body>
+</html>
diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 6dcd72b..3b270dd 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -42,7 +42,7 @@ repository.
 
 ### Using the Web Uploader
 
-To run the web uploader in a GNU Guix environment/container
+To run the web uploader in a GNU Guix environment/container run it with something like
 
 ```
 guix environment guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic  nss-certs --network openssl -- env FLASK_ENV=development PYTHONPATH=$PYTHONPATH:./bh20sequploader FLASK_APP=bh20simplewebuploader/main.py flask run
@@ -59,7 +59,7 @@ WIP: add gunicorn container
 Currently the full webserver container deploy command looks like
 
 ```
-penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic  nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
-``
+penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$  env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-oinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic  nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
+```
 
 Note: see above on GUIX_PACKAGE_PATH.
diff --git a/doc/blog/using-covid-19-pubseq-part1.html b/doc/blog/using-covid-19-pubseq-part1.html
index 1959fac..0e6136c 100644
--- a/doc/blog/using-covid-19-pubseq-part1.html
+++ b/doc/blog/using-covid-19-pubseq-part1.html
@@ -3,7 +3,7 @@
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
 <head>
-<!-- 2020-05-29 Fri 12:06 -->
+<!-- 2020-07-17 Fri 05:05 -->
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1" />
 <title>COVID-19 PubSeq (part 1)</title>
@@ -248,20 +248,20 @@ for the JavaScript code in this tag.
 <h2>Table of Contents</h2>
 <div id="text-table-of-contents">
 <ul>
-<li><a href="#org9afe6ab">1. What does this mean?</a></li>
-<li><a href="#orgf4bc3d4">2. Fetch sequence data</a></li>
-<li><a href="#org9d7d482">3. Predicates</a></li>
-<li><a href="#orgc6046bb">4. Fetch submitter info and other metadata</a></li>
-<li><a href="#orgdcb216b">5. Fetch all sequences from Washington state</a></li>
-<li><a href="#org7060f51">6. Discussion</a></li>
-<li><a href="#orgdc51ccc">7. Acknowledgements</a></li>
+<li><a href="#org0db5db0">1. What does this mean?</a></li>
+<li><a href="#orge5267fd">2. Fetch sequence data</a></li>
+<li><a href="#orgfbd3adc">3. Predicates</a></li>
+<li><a href="#org08e70e1">4. Fetch submitter info and other metadata</a></li>
+<li><a href="#org9194557">5. Fetch all sequences from Washington state</a></li>
+<li><a href="#org76317ad">6. Discussion</a></li>
+<li><a href="#orgeb871a1">7. Acknowledgements</a></li>
 </ul>
 </div>
 </div>
 
 
-<div id="outline-container-org9afe6ab" class="outline-2">
-<h2 id="org9afe6ab"><span class="section-number-2">1</span> What does this mean?</h2>
+<div id="outline-container-org0db5db0" class="outline-2">
+<h2 id="org0db5db0"><span class="section-number-2">1</span> What does this mean?</h2>
 <div class="outline-text-2" id="text-1">
 <p>
 This means that when someone uploads a SARS-CoV-2 sequence using one
@@ -274,24 +274,24 @@ expressed in a <a href="https://github.com/arvados/bh20-seq-resource/blob/master
   type: record
   fields:
     host_species:
-        doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_<span style="color: #8bc34a;">9606</span> for Homo sapiens
+        doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens
         type: string
         jsonldPredicate:
-          _id: http://www.ebi.ac.uk/efo/EFO_<span style="color: #8bc34a;">0000532</span>
-          _type: <span style="color: #9ccc65;">"@id"</span>
-          noLinkCheck: <span style="color: #8bc34a;">true</span>
+          _id: http://www.ebi.ac.uk/efo/EFO_0000532
+          _type: "@id"
+          noLinkCheck: true
     host_sex:
-        doc: Sex of the host as defined in PATO, expect male <span style="color: #e91e63;">()</span> or female <span style="color: #e91e63;">()</span>
+        doc: Sex of the host as defined in PATO, expect male () or female ()
         type: string?
         jsonldPredicate:
-          _id: http://purl.obolibrary.org/obo/PATO_<span style="color: #8bc34a;">0000047</span>
-          _type: <span style="color: #9ccc65;">"@id"</span>
-          noLinkCheck: <span style="color: #8bc34a;">true</span>
+          _id: http://purl.obolibrary.org/obo/PATO_0000047
+          _type: "@id"
+          noLinkCheck: true
     host_age:
-        doc: Age of the host as number <span style="color: #e91e63;">(</span>e.g. <span style="color: #8bc34a;">50</span><span style="color: #e91e63;">)</span>
+        doc: Age of the host as number (e.g. 50)
         type: int?
         jsonldPredicate:
-          _id: http://purl.obolibrary.org/obo/PATO_<span style="color: #8bc34a;">0000011</span>
+          _id: http://purl.obolibrary.org/obo/PATO_0000011
 </pre>
 </div>
 
@@ -314,8 +314,8 @@ initiative!
 </div>
 
 
-<div id="outline-container-orgf4bc3d4" class="outline-2">
-<h2 id="orgf4bc3d4"><span class="section-number-2">2</span> Fetch sequence data</h2>
+<div id="outline-container-orge5267fd" class="outline-2">
+<h2 id="orge5267fd"><span class="section-number-2">2</span> Fetch sequence data</h2>
 <div class="outline-text-2" id="text-2">
 <p>
 The latest run of the pipeline can be viewed <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">here</a>. Each of these
@@ -339,8 +339,8 @@ these identifiers throughout.
 </div>
 </div>
 
-<div id="outline-container-org9d7d482" class="outline-2">
-<h2 id="org9d7d482"><span class="section-number-2">3</span> Predicates</h2>
+<div id="outline-container-orgfbd3adc" class="outline-2">
+<h2 id="orgfbd3adc"><span class="section-number-2">3</span> Predicates</h2>
 <div class="outline-text-2" id="text-3">
 <p>
 To explore an RDF dataset, the first query we can do is open and gets
@@ -350,10 +350,10 @@ the following in a SPARQL end point
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?p
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">select distinct ?p
+{
    ?o ?p ?s
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -364,10 +364,10 @@ To get a <a href="http://sparql.genenetwork.org/sparql/?default-graph-uri=&amp;q
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?g
-<span style="color: #e91e63;">{</span>
-    GRAPH ?g <span style="color: #2196F3;">{</span>?s ?p ?o<span style="color: #2196F3;">}</span>
-<span style="color: #e91e63;">}</span>
+<pre class="src src-sql">select distinct ?g
+{
+    GRAPH ?g {?s ?p ?o}
+}
 </pre>
 </div>
 
@@ -383,10 +383,10 @@ To list all submitters, try
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?s
-<span style="color: #e91e63;">{</span>
-   ?o <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">#MainSchema/submitter&gt;</a> ?s
-<span style="color: #e91e63;">}</span>
+<pre class="src src-sql">select distinct ?s
+{
+   ?o &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/submitter&gt; ?s
+}
 </pre>
 </div>
 
@@ -397,11 +397,11 @@ and by
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?s
-<span style="color: #e91e63;">{</span>
-   ?o <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">#MainSchema/submitter&gt;</a> ?id .
+<pre class="src src-sql">select distinct ?s
+{
+   ?o &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/submitter&gt; ?id .
    ?id ?p ?s
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -415,12 +415,12 @@ To lift the full URL out of the query you can use a header like
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?dataset ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?dataset ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -438,32 +438,32 @@ Now we got this far, lets <a href="http://sparql.genenetwork.org/sparql/?default
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #e91e63;">(</span><span style="color: #ff8A65;">COUNT</span><span style="color: #2196F3;">(</span><span style="color: #fff59d;">distinct</span> ?dataset<span style="color: #2196F3;">)</span> <span style="color: #fff59d;">as</span> ?num<span style="color: #e91e63;">)</span>
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select (COUNT(distinct ?dataset) as ?num)
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 </div>
 </div>
 
 
-<div id="outline-container-orgc6046bb" class="outline-2">
-<h2 id="orgc6046bb"><span class="section-number-2">4</span> Fetch submitter info and other metadata</h2>
+<div id="outline-container-org08e70e1" class="outline-2">
+<h2 id="org08e70e1"><span class="section-number-2">4</span> Fetch submitter info and other metadata</h2>
 <div class="outline-text-2" id="text-4">
 <p>
 To get dataests with submitters we can do the above
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?dataset ?p ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?dataset ?p ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -480,13 +480,13 @@ Let's focus on one sample with
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?dataset ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?dataset ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter .
-   FILTER<span style="color: #2196F3;">(</span><span style="color: #fff59d;">CONTAINS</span><span style="color: #EF6C00;">(</span>?submitter,"Roychoudhury"<span style="color: #EF6C00;">)</span><span style="color: #2196F3;">)</span> .
-<span style="color: #e91e63;">}</span>
+   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
+}
 </pre>
 </div>
 
@@ -496,12 +496,12 @@ see if we can get a sample ID by listing sample predicates
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?p
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?p
+{
    ?dataset ?p ?o .
    ?dataset pubseq:submitter ?id .
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -513,15 +513,15 @@ Let's zoom in on those of Roychoudhury with
 
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?sid ?sample ?p1 ?dataset ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?sid ?sample ?p1 ?dataset ?submitter
+{
    ?dataset pubseq:submitter ?id .
    ?id ?p ?submitter .
-   FILTER<span style="color: #2196F3;">(</span><span style="color: #fff59d;">CONTAINS</span><span style="color: #EF6C00;">(</span>?submitter,"Roychoudhury"<span style="color: #EF6C00;">)</span><span style="color: #2196F3;">)</span> .
+   FILTER(CONTAINS(?submitter,"Roychoudhury")) .
    ?dataset pubseq:sample ?sid .
    ?sid ?p1 ?sample
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -532,18 +532,13 @@ this database. Let's focus on one sample "MT326090.1" with predicate
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">#MainSchema/&gt;
-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">PREFIX</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/"> sio: &lt;http://semanticscience.org/resource/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?sample ?p ?o
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+PREFIX sio: &lt;http://semanticscience.org/resource/&gt;
+select distinct ?sample ?p ?o
+{
    ?sample sio:SIO_000115 "MT326090.1" .
    ?sample ?p ?o .
-<span style="color: #e91e63;">}</span>
+}
 </pre>
 </div>
 
@@ -561,8 +556,8 @@ to view/query the database.
 </div>
 </div>
 
-<div id="outline-container-orgdcb216b" class="outline-2">
-<h2 id="orgdcb216b"><span class="section-number-2">5</span> Fetch all sequences from Washington state</h2>
+<div id="outline-container-org9194557" class="outline-2">
+<h2 id="org9194557"><span class="section-number-2">5</span> Fetch all sequences from Washington state</h2>
 <div class="outline-text-2" id="text-5">
 <p>
 Now we know how to get at the origin we can do it the other way round
@@ -570,15 +565,11 @@ and fetch all sequences referring to Washington state
 </p>
 
 <div class="org-src-container">
-<pre class="src src-sql">
-<span style="color: #fff59d;">select</span> ?seq ?sample
-<span style="color: #e91e63;">{</span>
-    ?seq <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
-    ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
-    ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
-    ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223">#MainSchema/sample&gt; ?sample .
-    ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223&gt;</a>
-<span style="color: #e91e63;">}</span>
+<pre class="src src-sql">select ?seq ?sample
+{
+    ?seq &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
+    ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223&gt;
+}
 </pre>
 </div>
 
@@ -586,11 +577,26 @@ and fetch all sequences referring to Washington state
 which lists 300 sequences originating from Washington state! Which is almost
 half of the set coming out of GenBank.
 </p>
+
+<p>
+Likewise to list all sequences from Turkey we can find the wikidata
+entity is <a href="https://www.wikidata.org/wiki/Q43">Q43</a>:
+</p>
+
+<div class="org-src-container">
+<pre class="src src-sql">select ?seq ?sample
+{
+    ?seq &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
+    ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q43&gt;
+}
+</pre>
 </div>
 </div>
+</div>
+
 
-<div id="outline-container-org7060f51" class="outline-2">
-<h2 id="org7060f51"><span class="section-number-2">6</span> Discussion</h2>
+<div id="outline-container-org76317ad" class="outline-2">
+<h2 id="org76317ad"><span class="section-number-2">6</span> Discussion</h2>
 <div class="outline-text-2" id="text-6">
 <p>
 The public sequence uploader collects sequences, raw data and
@@ -601,8 +607,8 @@ referenced in publications and origins are citeable.
 </div>
 </div>
 
-<div id="outline-container-orgdc51ccc" class="outline-2">
-<h2 id="orgdc51ccc"><span class="section-number-2">7</span> Acknowledgements</h2>
+<div id="outline-container-orgeb871a1" class="outline-2">
+<h2 id="orgeb871a1"><span class="section-number-2">7</span> Acknowledgements</h2>
 <div class="outline-text-2" id="text-7">
 <p>
 The overall effort was due to magnificent freely donated input by a
@@ -617,7 +623,7 @@ Garrison this initiative would not have existed!
 </div>
 </div>
 <div id="postamble" class="status">
-<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-05-29 Fri 12:06</small>.
+<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-17 Fri 05:02</small>.
 </div>
 </body>
 </html>
diff --git a/doc/blog/using-covid-19-pubseq-part4.html b/doc/blog/using-covid-19-pubseq-part4.html
index b5a05ca..c975c21 100644
--- a/doc/blog/using-covid-19-pubseq-part4.html
+++ b/doc/blog/using-covid-19-pubseq-part4.html
@@ -3,7 +3,7 @@
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
 <head>
-<!-- 2020-07-12 Sun 06:24 -->
+<!-- 2020-07-17 Fri 05:04 -->
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1" />
 <title>COVID-19 PubSeq (part 4)</title>
@@ -161,6 +161,19 @@
   .footdef  { margin-bottom: 1em; }
   .figure { padding: 1em; }
   .figure p { text-align: center; }
+  .equation-container {
+    display: table;
+    text-align: center;
+    width: 100%;
+  }
+  .equation {
+    vertical-align: middle;
+  }
+  .equation-label {
+    display: table-cell;
+    text-align: right;
+    vertical-align: middle;
+  }
   .inlinetask {
     padding: 10px;
     border: 2px solid gray;
@@ -186,7 +199,7 @@
 @licstart  The following is the entire license notice for the
 JavaScript code in this tag.
 
-Copyright (C) 2012-2018 Free Software Foundation, Inc.
+Copyright (C) 2012-2020 Free Software Foundation, Inc.
 
 The JavaScript code in this tag is free software: you can
 redistribute it and/or modify it under the terms of the GNU
@@ -235,15 +248,16 @@ for the JavaScript code in this tag.
 <h2>Table of Contents</h2>
 <div id="text-table-of-contents">
 <ul>
-<li><a href="#org8f8b64a">1. What does this mean?</a></li>
-<li><a href="#orgcc7a403">2. Modify Workflow</a></li>
+<li><a href="#orgc2ee09f">1. What does this mean?</a></li>
+<li><a href="#org0d37881">2. Where can I find the workflows?</a></li>
+<li><a href="#orgddb0531">3. Modify Workflow</a></li>
 </ul>
 </div>
 </div>
 
 
-<div id="outline-container-org8f8b64a" class="outline-2">
-<h2 id="org8f8b64a"><span class="section-number-2">1</span> What does this mean?</h2>
+<div id="outline-container-orgc2ee09f" class="outline-2">
+<h2 id="orgc2ee09f"><span class="section-number-2">1</span> What does this mean?</h2>
 <div class="outline-text-2" id="text-1">
 <p>
 This means that when someone uploads a SARS-CoV-2 sequence using one
@@ -253,18 +267,28 @@ which triggers a rerun of our workflows.
 </div>
 </div>
 
-
-<div id="outline-container-orgcc7a403" class="outline-2">
-<h2 id="orgcc7a403"><span class="section-number-2">2</span> Modify Workflow</h2>
+<div id="outline-container-org0d37881" class="outline-2">
+<h2 id="org0d37881"><span class="section-number-2">2</span> Where can I find the workflows?</h2>
 <div class="outline-text-2" id="text-2">
 <p>
+Workflows are written in the common workflow language (CWL) and listed
+on <a href="https://github.com/arvados/bh20-seq-resource/tree/master/workflows">github</a>. PubSeq being an open project these workflows can be studied
+and modified!
+</p>
+</div>
+</div>
+
+<div id="outline-container-orgddb0531" class="outline-2">
+<h2 id="orgddb0531"><span class="section-number-2">3</span> Modify Workflow</h2>
+<div class="outline-text-2" id="text-3">
+<p>
 <i>Work in progress!</i>
 </p>
 </div>
 </div>
 </div>
 <div id="postamble" class="status">
-<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-12 Sun 06:24</small>.
+<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-17 Fri 01:47</small>.
 </div>
 </body>
 </html>
diff --git a/doc/blog/using-covid-19-pubseq-part4.org b/doc/blog/using-covid-19-pubseq-part4.org
index 5fe71d1..8ad5e2d 100644
--- a/doc/blog/using-covid-19-pubseq-part4.org
+++ b/doc/blog/using-covid-19-pubseq-part4.org
@@ -10,6 +10,7 @@
 
 * Table of Contents                                                     :TOC:noexport:
  - [[#what-does-this-mean][What does this mean?]]
+ - [[#where-can-i-find-the-workflows][Where can I find the workflows?]]
  - [[#modify-workflow][Modify Workflow]]
 
 * What does this mean?
@@ -18,6 +19,11 @@ This means that when someone uploads a SARS-CoV-2 sequence using one
 of our tools (CLI or web-based) they add a sequence and some metadata
 which triggers a rerun of our workflows.
 
+* Where can I find the workflows?
+
+Workflows are written in the common workflow language (CWL) and listed
+on [[https://github.com/arvados/bh20-seq-resource/tree/master/workflows][github]]. PubSeq being an open project these workflows can be studied
+and modified!
 
 * Modify Workflow
 
diff --git a/doc/blog/using-covid-19-pubseq-part5.html b/doc/blog/using-covid-19-pubseq-part5.html
index 80bf559..4caa5ac 100644
--- a/doc/blog/using-covid-19-pubseq-part5.html
+++ b/doc/blog/using-covid-19-pubseq-part5.html
@@ -3,7 +3,7 @@
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
 <head>
-<!-- 2020-07-12 Sun 06:24 -->
+<!-- 2020-07-17 Fri 05:03 -->
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
 <meta name="viewport" content="width=device-width, initial-scale=1" />
 <title>COVID-19 PubSeq (part 4)</title>
@@ -161,6 +161,19 @@
   .footdef  { margin-bottom: 1em; }
   .figure { padding: 1em; }
   .figure p { text-align: center; }
+  .equation-container {
+    display: table;
+    text-align: center;
+    width: 100%;
+  }
+  .equation {
+    vertical-align: middle;
+  }
+  .equation-label {
+    display: table-cell;
+    text-align: right;
+    vertical-align: middle;
+  }
   .inlinetask {
     padding: 10px;
     border: 2px solid gray;
@@ -186,7 +199,7 @@
 @licstart  The following is the entire license notice for the
 JavaScript code in this tag.
 
-Copyright (C) 2012-2018 Free Software Foundation, Inc.
+Copyright (C) 2012-2020 Free Software Foundation, Inc.
 
 The JavaScript code in this tag is free software: you can
 redistribute it and/or modify it under the terms of the GNU
@@ -235,38 +248,40 @@ for the JavaScript code in this tag.
 <h2>Table of Contents</h2>
 <div id="text-table-of-contents">
 <ul>
-<li><a href="#org871ad58">1. Modify Metadata</a></li>
-<li><a href="#org07e8755">2. What is the schema?</a></li>
-<li><a href="#org4857280">3. How is the website generated?</a></li>
-<li><a href="#orge709ae2">4. Modifying the schema</a></li>
+<li><a href="#org758b923">1. Modify Metadata</a></li>
+<li><a href="#orgec32c13">2. What is the schema?</a></li>
+<li><a href="#org2e487b2">3. How is the website generated?</a></li>
+<li><a href="#orge4dfe84">4. Modifying the schema</a></li>
+<li><a href="#org564a7a8">5. Adding fields to the form</a></li>
+<li><a href="#org633781a">6. <span class="todo TODO">TODO</span> Testing the license fields</a></li>
 </ul>
 </div>
 </div>
 
 
-<div id="outline-container-org871ad58" class="outline-2">
-<h2 id="org871ad58"><span class="section-number-2">1</span> Modify Metadata</h2>
+<div id="outline-container-org758b923" class="outline-2">
+<h2 id="org758b923"><span class="section-number-2">1</span> Modify Metadata</h2>
 <div class="outline-text-2" id="text-1">
 <p>
 The public sequence resource uses multiple data formats listed on the
-<a href="./download">DOWNLOAD</a> page. One of the most exciting features is the full support
+<a href="http://covid19.genenetwork.org/download">download</a> page. One of the most exciting features is the full support
 for RDF and semantic web/linked data ontologies. This technology
 allows for querying data in unprescribed ways - that is, you can
 formulate your own queries without dealing with a preset model of that
 data (so typical of CSV files and SQL tables). Examples of exploring
-data are listed <a href="./blog?id=using-covid-19-pubseq-part1">here</a>.
+data are listed <a href="http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1">here</a>.
 </p>
 
 <p>
 In this BLOG we are going to look at the metadata entered on the
-<a href="./">COVID-19 PubSeq</a> website (or command line client). It is important to
+COVID-19 PubSeq website (or command line client). It is important to
 understand that anyone, including you, can change that information!
 </p>
 </div>
 </div>
 
-<div id="outline-container-org07e8755" class="outline-2">
-<h2 id="org07e8755"><span class="section-number-2">2</span> What is the schema?</h2>
+<div id="outline-container-orgec32c13" class="outline-2">
+<h2 id="orgec32c13"><span class="section-number-2">2</span> What is the schema?</h2>
 <div class="outline-text-2" id="text-2">
 <p>
 The default metadata schema is listed <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml">here</a>.
@@ -274,8 +289,8 @@ The default metadata schema is listed <a href="https://github.com/arvados/bh20-s
 </div>
 </div>
 
-<div id="outline-container-org4857280" class="outline-2">
-<h2 id="org4857280"><span class="section-number-2">3</span> How is the website generated?</h2>
+<div id="outline-container-org2e487b2" class="outline-2">
+<h2 id="org2e487b2"><span class="section-number-2">3</span> How is the website generated?</h2>
 <div class="outline-text-2" id="text-3">
 <p>
 Using the schema we use <a href="https://pypi.org/project/PyShEx/">pyshex</a> shex expressions and <a href="https://github.com/common-workflow-language/schema_salad">schema salad</a> to
@@ -285,13 +300,13 @@ All from that one metadata schema.
 </div>
 </div>
 
-<div id="outline-container-orge709ae2" class="outline-2">
-<h2 id="orge709ae2"><span class="section-number-2">4</span> Modifying the schema</h2>
+<div id="outline-container-orge4dfe84" class="outline-2">
+<h2 id="orge4dfe84"><span class="section-number-2">4</span> Modifying the schema</h2>
 <div class="outline-text-2" id="text-4">
 <p>
-One of the first things we wanted to do is to add a field for the data
-license. Initially we only support CC-4.0 as a license by default, but
-now we want to give uploaders the option to make it an even more
+One of the first things we want to do is to add a field for the data
+license. Initially we only supported CC-4.0 as a license, but
+we wanted to give uploaders the option to use an even more
 liberal CC0 license. The first step is to find a good ontology term
 for the field. Searching for `creative commons cc0 rdf' rendered this
 useful <a href="https://creativecommons.org/ns">page</a>.  We also find an <a href="https://wiki.creativecommons.org/wiki/CC_License_Rdf_Overview">overview</a> where CC0 is represented as URI
@@ -302,13 +317,148 @@ attributionName and attributionURL.
 </p>
 
 <p>
-<i>Note: work in progress</i>
+A minimal triple should be
+</p>
+
+<pre class="example">
+id  xhtml:license  &lt;http://creativecommons.org/licenses/by/4.0/&gt; .
+</pre>
+
+
+<p>
+Other suggestions are
+</p>
+
+<pre class="example">
+id  dc:title "Description" .
+id  cc:attributionName "Your Name" .
+id  cc:attributionURL &lt;http://resource.org/id&gt;
+</pre>
+
+
+<p>
+and 'dc:source' which indicates the original source of any modified
+work, specified as a URI.
+The prefix 'cc:' is an abbreviation for <a href="http://creativecommons.org/ns">http://creativecommons.org/ns</a>#.
+</p>
+
+<p>
+Going back to the schema, where does it fit? Under host, sample,
+virus, technology or submitter block? It could fit under sample, but
+actually the license concerns the whole metadata block and sequence,
+so I think we can fit under its own license tag. For example
+</p>
+
+
+<p>
+id: placeholder
+</p>
+
+<pre class="example">
+license:
+    license_type: http://creativecommons.org/licenses/by/4.0/
+    attribution_title: "Sample ID"
+    attribution_name: "John doe, Joe Boe, Jonny Oe"
+    attribution_url: http://covid19.genenetwork.org/id
+    attribution_source: https://www.ncbi.nlm.nih.gov/pubmed/323088888
+</pre>
+
+
+<p>
+So, let's update the example. Notice the license info is optional - if it is missing
+we just assume the default CC-4.0.
+</p>
+
+<p>
+One thing that is interesting is that in the name space <a href="https://creativecommons.org/ns">https://creativecommons.org/ns</a> there
+is no mention of a title. I think it is useful, however, because we have no such field.
+So, we'll add it simply as a title field. Now the draft schema is
 </p>
+
+<div class="org-src-container">
+<pre class="src src-js">- name: licenseSchema
+  type: record
+  fields:
+    license_type:
+      doc: License types as refined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#License
+    title:
+      doc: Attribution title related to license
+      type: string?
+      jsonldPredicate:
+          _id: http://semanticscience.org/resource/SIO_001167
+    attribution_url:
+      doc: Attribution URL related to license
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#Work
+    attribution_source:
+      doc: Attribution source URL
+      type: string?
+      jsonldPredicate:
+          _id: https://creativecommons.org/ns#Work
+</pre>
+</div>
+
+<p>
+Now, we are no ontology experts, right? So, next we submit a patch to
+our source tree and ask for feedback before wiring it up in the data
+entry form. The pull request was submitted <a href="https://github.com/arvados/bh20-seq-resource/pull/97">here</a> and reviewed on the
+gitter channel and I merged it.
+</p>
+</div>
 </div>
+
+<div id="outline-container-org564a7a8" class="outline-2">
+<h2 id="org564a7a8"><span class="section-number-2">5</span> Adding fields to the form</h2>
+<div class="outline-text-2" id="text-5">
+<p>
+To add the new fields to the form we have to modify it a little. If we
+go to the upload form we need to add the license box. The schema is
+loaded in <a href="https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229">main.py</a> in the 'generate<sub>form</sub>' function.
+</p>
+
+<p>
+With this <a href="https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3">patch</a> the website adds the license input fields on the form.
+</p>
+
+<p>
+Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This
+was done with this <a href="https://github.com/arvados/bh20-seq-resource/commit/f4ed46dae20abe5147871495ede2d6ac2b0854bc">patch</a>. In the end we decided to use the Dublin core title,
+<a href="http://purl.org/metadata/dublin_core_elements#Title">http://purl.org/metadata/dublin_core_elements#Title</a>:
+</p>
+
+<div class="org-src-container">
+<pre class="src src-js">:licenseShape{
+    cc:License xsd:string;
+    dc:Title xsd:string ?;
+    cc:attributionName xsd:string ?;
+    cc:attributionURL xsd:string ?;
+    cc:attributionSource xsd:string ?;
+}
+</pre>
+</div>
+
+<p>
+Note that cc:AttributionSource is not really defined in the cc standard.
+</p>
+
+<p>
+When pushing the license info we discovered the workflow broke because
+the existing data had no licensing info. So we changed the license
+field to be optional - a missing license assumes it is CC-BY-4.0.
+</p>
+</div>
+</div>
+
+<div id="outline-container-org633781a" class="outline-2">
+<h2 id="org633781a"><span class="section-number-2">6</span> <span class="todo TODO">TODO</span> Testing the license fields</h2>
 </div>
 </div>
 <div id="postamble" class="status">
-<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-12 Sun 06:24</small>.
+<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-16 Thu 03:27</small>.
 </div>
 </body>
 </html>
diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org
index 4b0ea64..78eea66 100644
--- a/doc/blog/using-covid-19-pubseq-part5.org
+++ b/doc/blog/using-covid-19-pubseq-part5.org
@@ -13,19 +13,21 @@
  - [[#what-is-the-schema][What is the schema?]]
  - [[#how-is-the-website-generated][How is the website generated?]]
  - [[#modifying-the-schema][Modifying the schema]]
+ - [[#adding-fields-to-the-form][Adding fields to the form]]
+ - [[#testing-the-license-fields][Testing the license fields]]
 
 * Modify Metadata
 
 The public sequence resource uses multiple data formats listed on the
-[[./download][DOWNLOAD]] page. One of the most exciting features is the full support
+[[http://covid19.genenetwork.org/download][download]] page. One of the most exciting features is the full support
 for RDF and semantic web/linked data ontologies. This technology
 allows for querying data in unprescribed ways - that is, you can
 formulate your own queries without dealing with a preset model of that
 data (so typical of CSV files and SQL tables). Examples of exploring
-data are listed [[./blog?id=using-covid-19-pubseq-part1][here]].
+data are listed [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1][here]].
 
 In this BLOG we are going to look at the metadata entered on the
-[[./][COVID-19 PubSeq]] website (or command line client). It is important to
+COVID-19 PubSeq website (or command line client). It is important to
 understand that anyone, including you, can change that information!
 
 * What is the schema?
@@ -41,8 +43,8 @@ All from that one metadata schema.
 * Modifying the schema
 
 One of the first things we want to do is to add a field for the data
-license. Initially we only support CC-4.0 as a license by default, but
-now we want to give uploaders the option to make it an even more
+license. Initially we only supported CC-4.0 as a license, but
+we wanted to give uploaders the option to use an even more
 liberal CC0 license. The first step is to find a good ontology term
 for the field. Searching for `creative commons cc0 rdf' rendered this
 useful [[https://creativecommons.org/ns][page]].  We also find an [[https://wiki.creativecommons.org/wiki/CC_License_Rdf_Overview][overview]] where CC0 is represented as URI
@@ -113,8 +115,37 @@ So, we'll add it simply as a title field. Now the draft schema is
           _id: https://creativecommons.org/ns#Work
 #+END_SRC
 
-Now, we are no ontology experts, right? So, next we submit a patch to our source tree and
-ask for feedback before wiring it up in the data entry form. The pull request was
-submitted here FIXME.
+Now, we are no ontology experts, right? So, next we submit a patch to
+our source tree and ask for feedback before wiring it up in the data
+entry form. The pull request was submitted [[https://github.com/arvados/bh20-seq-resource/pull/97][here]] and reviewed on the
+gitter channel and I merged it.
 
-/Note: work in progress/
+* Adding fields to the form
+
+To add the new fields to the form we have to modify it a little. If we
+go to the upload form we need to add the license box. The schema is
+loaded in [[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229][main.py]] in the 'generate_form' function.
+
+With this [[https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3][patch]] the website adds the license input fields on the form.
+
+Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This
+was done with this [[https://github.com/arvados/bh20-seq-resource/commit/f4ed46dae20abe5147871495ede2d6ac2b0854bc][patch]]. In the end we decided to use the Dublin core title,
+http://purl.org/metadata/dublin_core_elements#Title:
+
+#+BEGIN_SRC js
+:licenseShape{
+    cc:License xsd:string;
+    dc:Title xsd:string ?;
+    cc:attributionName xsd:string ?;
+    cc:attributionURL xsd:string ?;
+    cc:attributionSource xsd:string ?;
+}
+#+END_SRC
+
+Note that cc:AttributionSource is not really defined in the cc standard.
+
+When pushing the license info we discovered the workflow broke because
+the existing data had no licensing info. So we changed the license
+field to be optional - a missing license assumes it is CC-BY-4.0.
+
+* TODO Testing the license fields
diff --git a/doc/web/about.org b/doc/web/about.org
index ad13bc3..1949e2d 100644
--- a/doc/web/about.org
+++ b/doc/web/about.org
@@ -140,7 +140,8 @@ See the [[http://covid19.genenetwork.org/blog]]!
 
 * How do I change the work flows?
 
-See the [[http://covid19.genenetwork.org/blog]]!
+Workflows are on [[https://github.com/arvados/bh20-seq-resource/tree/master/workflows][github]] and can be modified. See also the
+[[[[http://covid19.genenetwork.org/blog]]][workflow blog]].
 
 * How do I change the source code?
 
diff --git a/example/minimal_metadata_example.yaml b/example/minimal_metadata_example.yaml
index 51f8a87..1b46cc7 100644
--- a/example/minimal_metadata_example.yaml
+++ b/example/minimal_metadata_example.yaml
@@ -1,5 +1,9 @@
 id: placeholder
 
+
+license:
+    license_type: http://creativecommons.org/licenses/by/4.0/
+
 host:
     host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
 
@@ -15,4 +19,4 @@ technology:
     sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
 
 submitter:
-    authors: [John Doe]
\ No newline at end of file
+    authors: [John Doe]
diff --git a/scripts/cleanup.py b/scripts/cleanup.py
new file mode 100644
index 0000000..78f34c8
--- /dev/null
+++ b/scripts/cleanup.py
@@ -0,0 +1,41 @@
+import arvados
+import arvados.util
+
+api = arvados.api()
+
+delete_patterns = [
+    "%missing%`collection_location`%",
+    "%missing%`technology`%",
+    "%missing%`host_species`%",
+    "%QC fail: alignment%",
+    "%does not look like a valid URI%",
+    "%Duplicate of%",
+    "%No matching triples found for predicate obo:NCIT_C42781%",
+    "%does not look like a valid URI%"
+    ]
+
+revalidate_patterns = [
+    "%missing%`license`%",
+    "%QC fail%"
+]
+
+for p in delete_patterns:
+    c = arvados.util.list_all(api.collections().list, filters=[
+        ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+        ["properties.errors", "like", p]])
+    for i in c:
+        print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+        api.collections().delete(uuid=i["uuid"]).execute()
+
+for p in revalidate_patterns:
+    c = arvados.util.list_all(api.collections().list, filters=[
+        ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+        ["properties.errors", "like", p]])
+    for i in c:
+        print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+        pr = i["properties"]
+        if "status" in pr:
+            del pr["status"]
+        if "errors" in pr:
+            del pr["errors"]
+        api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute()
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
deleted file mode 100644
index 88acb18..0000000
--- a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
+++ /dev/null
Binary files differdiff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
new file mode 100644
index 0000000..93ef550
--- /dev/null
+++ b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
Binary files differdiff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index ef0d119..352a30e 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -8,7 +8,7 @@ import gzip
 
 dir_yaml = 'yaml'
 
-date = '2020.07.05'
+date = '2020.07.09'
 
 # Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D)
 # Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases)
@@ -50,13 +50,14 @@ sra_metadata_xml_file.close()
 EXPERIMENT_PACKAGE_SET = tree.getroot()
 
 missing_value_list = []
+not_created_accession_list = []
 
 run_accession_set = set()
 run_accession_to_downloadble_file_url_dict = {}
 
 for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
     #print(i, EXPERIMENT_PACKAGE)
-    
+
     # A general default-empty yaml could be read from the definitive one
     info_for_yaml_dict = {
         'id': 'placeholder',
@@ -74,17 +75,17 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
     #print(accession)
 
     info_for_yaml_dict['sample']['sample_id'] = accession
-    
+
     #SRAFiles = RUN.find('SRAFiles')
     #if SRAFiles is not None:
     #    url = SRAFiles.find('SRAFile').attrib['url']
     #    if 'sra-download.ncbi.nlm.nih.gov' in url:
     #        run_accession_to_downloadble_file_url_dict[accession] = url
-  
+
 
     SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE')
     SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE')
-    
+
     for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list:
         VALUE = SAMPLE_ATTRIBUTE.find('VALUE')
         if VALUE is not None:
@@ -101,7 +102,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                     missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
             elif TAG_text in ['host_health_status', 'host health state']:
                 if VALUE_text in term_to_uri_dict:
-                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]                            
+                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
                 elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
                     missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
             elif TAG_text in ['strain', 'isolate']:
@@ -113,12 +114,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
 
                     if value_to_insert in term_to_uri_dict:
                         value_to_insert = term_to_uri_dict[value_to_insert]
-                        
-                    if 'virus_strain' not in info_for_yaml_dict:                        
+
+                    if 'virus_strain' not in info_for_yaml_dict:
                         info_for_yaml_dict['virus']['virus_strain'] = value_to_insert
                     else:
                         info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert
-            elif TAG_text in ['isolation_source', 'isolation source host-associated']:                    
+            elif TAG_text in ['isolation_source', 'isolation source host-associated']:
                 if VALUE_text in term_to_uri_dict:
                     info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
                 else:
@@ -145,7 +146,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
             elif TAG_text == 'collected_by':
                 if VALUE_text.lower() not in ['not available', 'missing']:
                     name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli']
-                    
+
                     info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text
             elif TAG_text == 'collecting institution':
                 if VALUE_text.lower() not in ['not provided', 'na']:
@@ -154,11 +155,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                 if VALUE_text.lower() not in ['not applicable', 'missing', 'na']:
                     date_to_write = VALUE_text
                     date_is_estimated = True
-                    
+
                     VALUE_text_list = VALUE_text.split('-')
                     if len(VALUE_text_list) == 3:
                         date_is_estimated = False
-                        
+
                         if VALUE_text_list[1].isalpha():
                             date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
                     elif len(VALUE_text_list) == 2:
@@ -170,7 +171,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
                             date_to_write = "{}-01-15".format(VALUE_text)
 
                     info_for_yaml_dict['sample']['collection_date'] = date_to_write
-                    
+
                     if date_is_estimated:
                         if 'additional_collection_information' in info_for_yaml_dict['sample']:
                             info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
@@ -188,8 +189,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
 
     taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
     info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id
-    
-    
+
+
     EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
     INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]
 
@@ -206,18 +207,18 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
 
     SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION')
     info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession']
-    
+
     if SUBMISSION.attrib['lab_name'].lower() not in ['na']:
         info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name']
 
-    STUDY = EXPERIMENT_PACKAGE.find('STUDY')     
+    STUDY = EXPERIMENT_PACKAGE.find('STUDY')
     info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias']
-    
-    
+
+
     Organization = EXPERIMENT_PACKAGE.find('Organization')
     Organization_Name = Organization.find('Name')
     info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text]
-        
+
     Organization_Contact = Organization.find('Contact')
     if Organization_Contact is not None:
         Organization_Contact_Name = Organization_Contact.find('Name')
@@ -231,20 +232,33 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
     Organization_Address = Organization.find('Address')
     if Organization_Address is not None:
         info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']])
-    
+
     if 'collection_date' not in info_for_yaml_dict['sample']:
         info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
         info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"
 
     if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
-        print(accession, ' - technology not found')
+        #print(accession, ' - technology not found')
+        not_created_accession_list.append([accession, 'technology not found'])
+        continue
+
+    if 'host_species' not in info_for_yaml_dict['host']:
+        #print(accession, ' - technology not found')
+        not_created_accession_list.append([accession, 'missing host species'])
         continue
 
     with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
         json.dump(info_for_yaml_dict, fw, indent=2)
-    
+
 if len(missing_value_list) > 0:
-    path_missing_terms_tsv = 'missing_terms.tsv'
+    path_missing_terms_tsv = 'missing_terms.sra.tsv'
     print('Written missing terms in {}'.format(path_missing_terms_tsv))
     with open(path_missing_terms_tsv, 'w') as fw:
         fw.write('\n'.join(missing_value_list))
+
+if len(not_created_accession_list) > 0:
+    path_not_created_accession_tsv = 'not_created_accession.sra.tsv'
+    print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+    with open(path_not_created_accession_tsv, 'w') as fw:
+        fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv
index 40572a3..0bfc455 100644
--- a/scripts/dict_ontology_standardization/ncbi_host_species.csv
+++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv
@@ -2,6 +2,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
 human,http://purl.obolibrary.org/obo/NCBITaxon_9606
 Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
 sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
+homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
 Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
 Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
 Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 39e401a..dbebfbb 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -138,6 +138,7 @@ min_len_to_count = 27500
 num_seq_with_len_ge_X_bp = 0
 
 missing_value_list = []
+not_created_accession_list = []
 accession_with_errors_list = []
 
 for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
 
             if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
-                print(accession_version, ' - technology not found')
+                #print(accession_version, ' - technology not found')
+                not_created_accession_list.append([accession_version, 'technology not found'])
                 continue
 
             with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -389,15 +391,21 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
             continue
 
 if len(missing_value_list) > 0:
-    path_missing_terms_tsv = 'missing_terms.tsv'
+    path_missing_terms_tsv = 'missing_terms.genbank.tsv'
     print('Written missing terms in {}'.format(path_missing_terms_tsv))
     with open(path_missing_terms_tsv, 'w') as fw:
         fw.write('\n'.join(missing_value_list))
 
 if len(accession_with_errors_list) > 0:
-    path_accession_with_errors_tsv = 'accession_with_errors.tsv'
+    path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv'
     print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
     with open(path_accession_with_errors_tsv, 'w') as fw:
         fw.write('\n'.join(accession_with_errors_list))
 
+if len(not_created_accession_list) > 0:
+    path_not_created_accession_tsv = 'not_created_accession.genbank.tsv'
+    print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+    with open(path_not_created_accession_tsv, 'w') as fw:
+        fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
 print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
author	Pjotr Prins	2020-07-17 11:08:15 +0100
committer	Pjotr Prins	2020-07-17 11:08:15 +0100
commit	16bb5df907c79cd0ce6bea0015821a2ce51fb992 (patch)
tree	ddb9677cddcc463bb514300189cbd4300b9117ed
parent	0be9983ef88fd3b925d8fa53e7f9ab2a28703bc0 (diff)
parent	c69046ee9a5e24eadcd8cb885633328b0fd88011 (diff)
download	bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.gz bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.lz bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.zip