author     Pjotr Prins  2020-07-17 11:08:15 +0100
committer  Pjotr Prins  2020-07-17 11:08:15 +0100
commit     16bb5df907c79cd0ce6bea0015821a2ce51fb992 (patch)
tree       ddb9677cddcc463bb514300189cbd4300b9117ed
parent     0be9983ef88fd3b925d8fa53e7f9ab2a28703bc0 (diff)
parent     c69046ee9a5e24eadcd8cb885633328b0fd88011 (diff)
download   bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.gz
           bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.lz
           bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.zip
Merge branch 'master' into ebi-submit
-rw-r--r--  bh20seqanalyzer/main.py  618
-rw-r--r--  bh20sequploader/bh20seq-schema.yml  15
-rw-r--r--  bh20sequploader/bh20seq-shex.rdf  25
-rw-r--r--  bh20sequploader/main.py  22
-rw-r--r--  bh20sequploader/qc_fasta.py  4
-rw-r--r--  bh20simplewebuploader/main.py  242
-rw-r--r--  bh20simplewebuploader/static/blog.css  17
-rw-r--r--  bh20simplewebuploader/static/image/CWL.png  bin 0 -> 11066 bytes
-rw-r--r--  bh20simplewebuploader/static/image/ESR.png  bin 0 -> 67869 bytes
-rw-r--r--  bh20simplewebuploader/static/image/curii.logo.ai.png  bin 0 -> 16739 bytes
-rw-r--r--  bh20simplewebuploader/static/image/curii.logo.ai.svg  3
-rw-r--r--  bh20simplewebuploader/static/image/edit.png  bin 0 -> 2452 bytes
-rw-r--r--  bh20simplewebuploader/static/main.css  38
-rw-r--r--  bh20simplewebuploader/static/main.js  151
-rw-r--r--  bh20simplewebuploader/static/map.js  50
-rw-r--r--  bh20simplewebuploader/templates/blog.html  4
-rw-r--r--  bh20simplewebuploader/templates/blurb.html  2
-rw-r--r--  bh20simplewebuploader/templates/demo-run.html  26
-rw-r--r--  bh20simplewebuploader/templates/demo.html  49
-rw-r--r--  bh20simplewebuploader/templates/error.html  2
-rw-r--r--  bh20simplewebuploader/templates/footer.html  12
-rw-r--r--  bh20simplewebuploader/templates/header.html  18
-rw-r--r--  bh20simplewebuploader/templates/home.html  15
-rw-r--r--  bh20simplewebuploader/templates/map.html  33
-rw-r--r--  bh20simplewebuploader/templates/mapheader.html  16
-rw-r--r--  bh20simplewebuploader/templates/search.html  10
-rw-r--r--  bh20simplewebuploader/templates/status.html  3
-rw-r--r--  bh20simplewebuploader/templates/success.html  2
-rw-r--r--  bh20simplewebuploader/templates/validated.html  17
-rw-r--r--  doc/INSTALL.md  6
-rw-r--r--  doc/blog/using-covid-19-pubseq-part1.html  192
-rw-r--r--  doc/blog/using-covid-19-pubseq-part4.html  44
-rw-r--r--  doc/blog/using-covid-19-pubseq-part4.org  6
-rw-r--r--  doc/blog/using-covid-19-pubseq-part5.html  194
-rw-r--r--  doc/blog/using-covid-19-pubseq-part5.org  49
-rw-r--r--  doc/web/about.org  3
-rw-r--r--  example/minimal_metadata_example.yaml  6
-rw-r--r--  scripts/cleanup.py  41
-rw-r--r--  scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz  bin 6502056 -> 0 bytes
-rw-r--r--  scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz  bin 0 -> 9744133 bytes
-rw-r--r--  scripts/create_sra_metadata/create_sra_metadata.py  62
-rw-r--r--  scripts/dict_ontology_standardization/ncbi_host_species.csv  1
-rwxr-xr-x  scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py  14
43 files changed, 1227 insertions(+), 785 deletions(-)
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 0b52e6b..b3a439d 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -16,277 +16,308 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%
level=logging.INFO)
logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN)
-def validate_upload(api, collection, validated_project,
- fastq_project, fastq_workflow_uuid,
- revalidate):
- col = arvados.collection.Collection(collection["uuid"])
-
- if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
- return False
-
- # validate the collection here. Check metadata, etc.
- logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))
-
- errors = []
-
- if collection["owner_uuid"] != validated_project:
- dup = api.collections().list(filters=[["owner_uuid", "=", validated_project],
- ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
- if dup["items"]:
- # This exact collection has been uploaded before.
- errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
-
- if not errors:
- if "metadata.yaml" not in col:
- errors.append("Missing metadata.yaml", collection["name"])
- else:
+class SeqAnalyzer:
+
+ def __init__(self, api, keepclient,
+ uploader_project,
+ pangenome_analysis_project,
+ fastq_project,
+ validated_project,
+ workflow_def_project,
+ pangenome_workflow_uuid,
+ fastq_workflow_uuid,
+ exclude_list,
+ latest_result_collection):
+ self.api = api
+ self.keepclient = keepclient
+ self.uploader_project = uploader_project
+ self.pangenome_analysis_project = pangenome_analysis_project
+ self.fastq_project = fastq_project
+ self.validated_project = validated_project
+ self.workflow_def_project = workflow_def_project
+ self.pangenome_workflow_uuid = pangenome_workflow_uuid
+ self.fastq_workflow_uuid = fastq_workflow_uuid
+ self.exclude_list = exclude_list
+ self.latest_result_uuid = latest_result_collection
+ self.schema_ref = None
+
+ def validate_upload(self, collection, revalidate):
+ col = arvados.collection.Collection(collection["uuid"], api_client=self.api, keep_client=self.keepclient)
+
+ if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
+ return False
+
+ # validate the collection here. Check metadata, etc.
+ logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))
+
+ errors = []
+
+ if collection["owner_uuid"] != self.validated_project:
+ dup = self.api.collections().list(filters=[["owner_uuid", "=", self.validated_project],
+ ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
+ if dup["items"]:
+ # This exact collection has been uploaded before.
+ errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+
+ if not errors:
+ if "metadata.yaml" not in col:
+                errors.append("Missing metadata.yaml in %s" % collection["name"])
+ else:
+ try:
+ with col.open("metadata.yaml") as md:
+ metadata_content = ruamel.yaml.round_trip_load(md)
+ metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
+ sample_id = metadata_content["sample"]["sample_id"]
+ add_lc_filename(metadata_content, metadata_content["id"])
+ valid = qc_metadata(metadata_content)
+ if not valid:
+ errors.append("Failed metadata qc")
+ except Exception as e:
+ errors.append(str(e))
+
+ if not errors:
try:
- metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml"))
- metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
- sample_id = metadata_content["sample"]["sample_id"]
- add_lc_filename(metadata_content, metadata_content["id"])
- valid = qc_metadata(metadata_content)
- if not valid:
- errors.append("Failed metadata qc")
- except Exception as e:
- errors.append(str(e))
-
- if not errors:
- try:
- tgt = None
- paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
- for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
- if n not in col:
- continue
- with col.open(n, 'rb') as qf:
- tgt = qc_fasta(qf)[0]
- if tgt != n and tgt != paired.get(n):
- errors.append("Expected %s but magic says it should be %s", n, tgt)
- elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
- start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid, n, sample_id)
- return False
- if tgt is None:
- errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"])
- except Exception as v:
- errors.append(str(v))
-
-
- if not errors:
- # Move it to the "validated" project to be included in the next analysis
- if "errors" in collection["properties"]:
- del collection["properties"]["errors"]
- collection["properties"]["status"] = "validated"
- api.collections().update(uuid=collection["uuid"], body={
- "owner_uuid": validated_project,
- "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
- "properties": collection["properties"]}).execute()
- logging.info("Added '%s' to validated sequences" % collection["name"])
- return True
- else:
- # It is invalid
- logging.warn("'%s' (%s) has validation errors: %s" % (
- collection["name"], collection["uuid"], "\n".join(errors)))
- collection["properties"]["status"] = "rejected"
- collection["properties"]["errors"] = errors
- api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute()
- return False
-
-
-def run_workflow(api, parent_project, workflow_uuid, name, inputobj):
- project = api.groups().create(body={
- "group_class": "project",
- "name": name,
- "owner_uuid": parent_project,
- }, ensure_unique_name=True).execute()
-
- with tempfile.NamedTemporaryFile() as tmp:
- tmp.write(json.dumps(inputobj, indent=2).encode('utf-8'))
- tmp.flush()
- cmd = ["arvados-cwl-runner",
- "--submit",
- "--no-wait",
- "--project-uuid=%s" % project["uuid"],
- "arvwf:%s" % workflow_uuid,
- tmp.name]
- logging.info("Running %s" % ' '.join(cmd))
- comp = subprocess.run(cmd, capture_output=True)
- logging.info("Submitted %s", comp.stdout)
- if comp.returncode != 0:
- logging.error(comp.stderr.decode('utf-8'))
-
- return project
-
-
-def start_fastq_to_fasta(api, collection,
- analysis_project,
- fastq_workflow_uuid,
- tgt,
- sample_id):
-
- params = {
- "metadata": {
- "class": "File",
- "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
- },
- "ref_fasta": {
- "class": "File",
- "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
- },
- "sample_id": sample_id
- }
-
- if tgt.startswith("reads.fastq"):
- params["fastq_forward"] = {
- "class": "File",
- "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
- }
- elif tgt.startswith("reads_1.fastq"):
- params["fastq_forward"] = {
- "class": "File",
- "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
- }
- params["fastq_reverse"] = {
- "class": "File",
- "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+ tgt = None
+ paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
+ for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+ if n not in col:
+ continue
+ with col.open(n, 'rb') as qf:
+ tgt = qc_fasta(qf)[0]
+ if tgt != n and tgt != paired.get(n):
+                        errors.append("Expected %s but magic says it should be %s" % (n, tgt))
+ elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
+ self.start_fastq_to_fasta(collection, n, sample_id)
+ return False
+ if tgt is None:
+                    errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq" % collection["name"])
+ except Exception as v:
+ errors.append(str(v))
+
+
+ if not errors:
+ # Move it to the "validated" project to be included in the next analysis
+ if "errors" in collection["properties"]:
+ del collection["properties"]["errors"]
+ collection["properties"]["status"] = "validated"
+ self.api.collections().update(uuid=collection["uuid"], body={
+ "owner_uuid": self.validated_project,
+ "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
+ "properties": collection["properties"]}).execute()
+ logging.info("Added '%s' to validated sequences" % collection["name"])
+ return True
+ else:
+ # It is invalid
+ logging.warn("'%s' (%s) has validation errors: %s" % (
+ collection["name"], collection["uuid"], "\n".join(errors)))
+ collection["properties"]["status"] = "rejected"
+ collection["properties"]["errors"] = errors
+ self.api.collections().update(uuid=collection["uuid"], body={"properties": collection["properties"]}).execute()
+ return False
+
+
+ def run_workflow(self, parent_project, workflow_uuid, name, inputobj):
+ project = self.api.groups().create(body={
+ "group_class": "project",
+ "name": name,
+ "owner_uuid": parent_project,
+ }, ensure_unique_name=True).execute()
+
+ with tempfile.NamedTemporaryFile() as tmp:
+ tmp.write(json.dumps(inputobj, indent=2).encode('utf-8'))
+ tmp.flush()
+ cmd = ["arvados-cwl-runner",
+ "--submit",
+ "--no-wait",
+ "--project-uuid=%s" % project["uuid"],
+ "arvwf:%s" % workflow_uuid,
+ tmp.name]
+ logging.info("Running %s" % ' '.join(cmd))
+ comp = subprocess.run(cmd, capture_output=True)
+ logging.info("Submitted %s", comp.stdout)
+ if comp.returncode != 0:
+ logging.error(comp.stderr.decode('utf-8'))
+
+ return project
+
+
+ def start_fastq_to_fasta(self, collection,
+ tgt,
+ sample_id):
+
+ params = {
+ "metadata": {
+ "class": "File",
+ "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"]
+ },
+ "ref_fasta": {
+ "class": "File",
+ "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta"
+ },
+ "sample_id": sample_id
}
- newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", params)
- api.collections().update(uuid=collection["uuid"],
- body={"owner_uuid": newproject["uuid"]}).execute()
-
-def start_pangenome_analysis(api,
- analysis_project,
- pangenome_workflow_uuid,
- validated_project,
- schema_ref,
- exclude_list):
- validated = arvados.util.list_all(api.collections().list, filters=[
- ["owner_uuid", "=", validated_project],
- ["properties.status", "=", "validated"]])
- inputobj = {
- "inputReads": [],
- "metadata": [],
- "subjects": [],
- "metadataSchema": {
- "class": "File",
- "location": schema_ref
- },
- "exclude": {
- "class": "File",
- "location": exclude_list
+ if tgt.startswith("reads.fastq"):
+ params["fastq_forward"] = {
+ "class": "File",
+ "location": "keep:%s/%s" % (collection["portable_data_hash"], tgt)
+ }
+ elif tgt.startswith("reads_1.fastq"):
+ params["fastq_forward"] = {
+ "class": "File",
+ "location": "keep:%s/reads_1.%s" % (collection["portable_data_hash"], tgt[8:])
+ }
+ params["fastq_reverse"] = {
+ "class": "File",
+ "location": "keep:%s/reads_2.%s" % (collection["portable_data_hash"], tgt[8:])
+ }
+
+ newproject = self.run_workflow(self.fastq_project, self.fastq_workflow_uuid, "FASTQ to FASTA", params)
+ self.api.collections().update(uuid=collection["uuid"],
+ body={"owner_uuid": newproject["uuid"]}).execute()
+
+ def start_pangenome_analysis(self):
+
+ if self.schema_ref is None:
+ self.upload_schema()
+
+ validated = arvados.util.list_all(self.api.collections().list, filters=[
+ ["owner_uuid", "=", self.validated_project],
+ ["properties.status", "=", "validated"]])
+ inputobj = {
+ "inputReads": [],
+ "metadata": [],
+ "subjects": [],
+ "metadataSchema": {
+ "class": "File",
+ "location": self.schema_ref
+ },
+ "exclude": {
+ "class": "File",
+ "location": self.exclude_list
+ }
}
- }
- validated.sort(key=lambda v: v["portable_data_hash"])
- for v in validated:
- inputobj["inputReads"].append({
- "class": "File",
- "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
- })
- inputobj["metadata"].append({
- "class": "File",
- "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
- })
- inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
- run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj)
-
-
-def get_workflow_output_from_project(api, uuid):
- cr = api.container_requests().list(filters=[['owner_uuid', '=', uuid],
- ["requesting_container_uuid", "=", None]]).execute()
- if cr["items"] and cr["items"][0]["output_uuid"]:
- container = api.containers().get(uuid=cr["items"][0]["container_uuid"]).execute()
- if container["state"] == "Complete" and container["exit_code"] == 0:
- return cr["items"][0]
- return None
-
-
-def copy_most_recent_result(api, analysis_project, latest_result_uuid):
- most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]],
- order="created_at desc").execute()
- for m in most_recent_analysis["items"]:
- wf = get_workflow_output_from_project(api, m["uuid"])
- if wf:
- src = api.collections().get(uuid=wf["output_uuid"]).execute()
- dst = api.collections().get(uuid=latest_result_uuid).execute()
- if src["portable_data_hash"] != dst["portable_data_hash"]:
- logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid)
- api.collections().update(uuid=latest_result_uuid,
- body={"manifest_text": src["manifest_text"],
- "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute()
- break
-
+ validated.sort(key=lambda v: v["portable_data_hash"])
+ for v in validated:
+ inputobj["inputReads"].append({
+ "class": "File",
+ "location": "keep:%s/sequence.fasta" % v["portable_data_hash"]
+ })
+ inputobj["metadata"].append({
+ "class": "File",
+ "location": "keep:%s/metadata.yaml" % v["portable_data_hash"]
+ })
+ inputobj["subjects"].append("http://collections.lugli.arvadosapi.com/c=%s/sequence.fasta" % v["portable_data_hash"])
+ self.run_workflow(self.pangenome_analysis_project, self.pangenome_workflow_uuid, "Pangenome analysis", inputobj)
+
+
+ def get_workflow_output_from_project(self, uuid):
+ cr = self.api.container_requests().list(filters=[['owner_uuid', '=', uuid],
+ ["requesting_container_uuid", "=", None]]).execute()
+ if cr["items"] and cr["items"][0]["output_uuid"]:
+ container = self.api.containers().get(uuid=cr["items"][0]["container_uuid"]).execute()
+ if container["state"] == "Complete" and container["exit_code"] == 0:
+ return cr["items"][0]
+ return None
+
+
+ def copy_most_recent_result(self):
+ most_recent_analysis = self.api.groups().list(filters=[['owner_uuid', '=', self.pangenome_analysis_project]],
+ order="created_at desc").execute()
+ for m in most_recent_analysis["items"]:
+ wf = self.get_workflow_output_from_project(m["uuid"])
+ if wf:
+ src = self.api.collections().get(uuid=wf["output_uuid"]).execute()
+ dst = self.api.collections().get(uuid=self.latest_result_uuid).execute()
+ if src["portable_data_hash"] != dst["portable_data_hash"]:
+ logging.info("Copying latest result from '%s' to %s", m["name"], self.latest_result_uuid)
+ self.api.collections().update(uuid=self.latest_result_uuid,
+ body={"manifest_text": src["manifest_text"],
+ "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute()
+ break
+
+
+ def move_fastq_to_fasta_results(self):
+ projects = self.api.groups().list(filters=[['owner_uuid', '=', self.fastq_project],
+ ["properties.moved_output", "!=", True]],
+ order="created_at asc",).execute()
+ for p in projects["items"]:
+ wf = self.get_workflow_output_from_project(p["uuid"])
+ if not wf:
+ continue
-def move_fastq_to_fasta_results(api, analysis_project, uploader_project):
- projects = api.groups().list(filters=[['owner_uuid', '=', analysis_project],
- ["properties.moved_output", "!=", True]],
- order="created_at desc",).execute()
- for p in projects["items"]:
- wf = get_workflow_output_from_project(api, p["uuid"])
- if wf:
logging.info("Moving completed fastq2fasta result %s back to uploader project", wf["output_uuid"])
- api.collections().update(uuid=wf["output_uuid"],
- body={"owner_uuid": uploader_project}).execute()
- p["properties"]["moved_output"] = True
- api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute()
- break
+ col = arvados.collection.Collection(wf["output_uuid"], api_client=self.api, keep_client=self.keepclient)
+ with col.open("metadata.yaml") as md:
+ metadata_content = ruamel.yaml.round_trip_load(md)
+
+ colprop = col.get_properties()
+ colprop["sequence_label"] = metadata_content["sample"]["sample_id"]
+ self.api.collections().update(uuid=wf["output_uuid"],
+ body={"owner_uuid": self.uploader_project,
+ "properties": colprop}).execute()
-def upload_schema(api, workflow_def_project):
- schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml")
- c = arvados.collection.Collection()
- with c.open("schema.yml", "wb") as f:
- f.write(schema_resource.read())
- pdh = c.portable_data_hash()
- wd = api.collections().list(filters=[["owner_uuid", "=", workflow_def_project],
- ["portable_data_hash", "=", pdh]]).execute()
- if len(wd["items"]) == 0:
- c.save_new(owner_uuid=workflow_def_project, name="Metadata schema", ensure_unique_name=True)
- return "keep:%s/schema.yml" % pdh
-
-
-def print_status(api, uploader_project, fmt):
- pending = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", uploader_project]])
- out = []
- status = {}
- for p in pending:
- prop = p["properties"]
- out.append(prop)
- if "status" not in prop:
- prop["status"] = "pending"
- prop["created_at"] = p["created_at"]
- prop["uuid"] = p["uuid"]
- status[prop["status"]] = status.get(prop["status"], 0) + 1
- if fmt == "html":
- print(
-"""
-<html>
-<body>
-""")
- print("<p>Total collections in upload project %s</p>" % len(out))
- print("<p>Status %s</p>" % status)
- print(
-"""
-<table>
-<tr><th>Collection</th>
-<th>Sequence label</th>
-<th>Status</th>
-<th>Errors</th></tr>
-""")
- for r in out:
- print("<tr valign='top'>")
- print("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
- print("<td>%s</td>" % r["sequence_label"])
- print("<td>%s</td>" % r["status"])
- print("<td><pre>%s</pre></td>" % "\n".join(r.get("errors", [])))
- print("</tr>")
- print(
-"""
-</table>
-</body>
-</html>
-""")
- else:
- print(json.dumps(out, indent=2))
+ p["properties"]["moved_output"] = True
+ self.api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute()
+
+
+ def upload_schema(self):
+ schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml")
+ c = arvados.collection.Collection(api_client=self.api, keep_client=self.keepclient)
+ with c.open("schema.yml", "wb") as f:
+ f.write(schema_resource.read())
+ pdh = c.portable_data_hash()
+ wd = self.api.collections().list(filters=[["owner_uuid", "=", self.workflow_def_project],
+ ["portable_data_hash", "=", pdh]]).execute()
+ if len(wd["items"]) == 0:
+ c.save_new(owner_uuid=self.workflow_def_project, name="Metadata schema", ensure_unique_name=True)
+ self.schema_ref = "keep:%s/schema.yml" % pdh
+
+
+ def print_status(self, fmt):
+ pending = arvados.util.list_all(self.api.collections().list, filters=[["owner_uuid", "=", self.uploader_project]])
+ out = []
+ status = {}
+ for p in pending:
+ prop = p["properties"]
+ out.append(prop)
+ if "status" not in prop:
+ prop["status"] = "pending"
+ prop["created_at"] = p["created_at"]
+ prop["uuid"] = p["uuid"]
+ status[prop["status"]] = status.get(prop["status"], 0) + 1
+ if fmt == "html":
+ print(
+ """
+ <html>
+ <body>
+ """)
+ print("<p>Total collections in upload project %s</p>" % len(out))
+ print("<p>Status %s</p>" % status)
+ print(
+ """
+ <table>
+ <tr><th>Collection</th>
+ <th>Sequence label</th>
+ <th>Status</th>
+ <th>Errors</th></tr>
+ """)
+ for r in out:
+ print("<tr valign='top'>")
+ print("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+ print("<td>%s</td>" % r["sequence_label"])
+ print("<td>%s</td>" % r["status"])
+ print("<td><pre>%s</pre></td>" % "\n".join(r.get("errors", [])))
+ print("</tr>")
+ print(
+ """
+ </table>
+ </body>
+ </html>
+ """)
+ else:
+ print(json.dumps(out, indent=2))
def main():
parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project')
@@ -310,50 +341,45 @@ def main():
args = parser.parse_args()
api = arvados.api()
-
-
-
- schema_ref = upload_schema(api, args.workflow_def_project)
+ keepclient = arvados.keep.KeepClient(api_client=api)
+
+ seqanalyzer = SeqAnalyzer(api, keepclient,
+ args.uploader_project,
+ args.pangenome_analysis_project,
+ args.fastq_project,
+ args.validated_project,
+ args.workflow_def_project,
+ args.pangenome_workflow_uuid,
+ args.fastq_workflow_uuid,
+ args.exclude_list,
+ args.latest_result_collection)
if args.kickoff:
logging.info("Starting a single analysis run")
- start_pangenome_analysis(api,
- args.pangenome_analysis_project,
- args.pangenome_workflow_uuid,
- args.validated_project,
- schema_ref,
- args.exclude_list)
+ seqanalyzer.start_pangenome_analysis()
return
if args.print_status:
- print_status(api, args.uploader_project, args.print_status)
+ seqanalyzer.print_status(args.print_status)
exit(0)
logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project))
while True:
- move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project)
-
- new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]])
- at_least_one_new_valid_seq = False
- for c in new_collections:
- at_least_one_new_valid_seq = validate_upload(api, c,
- args.validated_project,
- args.fastq_project,
- args.fastq_workflow_uuid,
- args.revalidate) or at_least_one_new_valid_seq
-
- if at_least_one_new_valid_seq and not args.no_start_analysis:
- start_pangenome_analysis(api,
- args.pangenome_analysis_project,
- args.pangenome_workflow_uuid,
- args.validated_project,
- schema_ref,
- args.exclude_list)
-
- copy_most_recent_result(api,
- args.pangenome_analysis_project,
- args.latest_result_collection)
+ try:
+ seqanalyzer.move_fastq_to_fasta_results()
+
+ new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]])
+ at_least_one_new_valid_seq = False
+ for c in new_collections:
+ at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq
+
+ if at_least_one_new_valid_seq and not args.no_start_analysis:
+ seqanalyzer.start_pangenome_analysis()
+
+ seqanalyzer.copy_most_recent_result()
+ except Exception as e:
+                logging.exception("Error in main loop")
if args.once:
break
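
The hunk above folds the former free functions of bh20seqanalyzer/main.py into a SeqAnalyzer class that keeps the Arvados API client, Keep client, project UUIDs and workflow UUIDs as instance state, and wraps the polling loop in a try/except so one bad upload no longer kills the monitor. A minimal sketch of driving the class outside of main(), assuming the module layout above; every identifier below is a placeholder, not a real Arvados UUID:

import arvados
import arvados.keep
import arvados.util

from bh20seqanalyzer.main import SeqAnalyzer

# All identifiers below are placeholders, not real Arvados UUIDs.
uploader_project = "zzzzz-j7d0g-uploaderproject"

api = arvados.api()
keepclient = arvados.keep.KeepClient(api_client=api)

analyzer = SeqAnalyzer(api, keepclient,
                       uploader_project=uploader_project,
                       pangenome_analysis_project="zzzzz-j7d0g-analysisproj",
                       fastq_project="zzzzz-j7d0g-fastqproject",
                       validated_project="zzzzz-j7d0g-validatedproj",
                       workflow_def_project="zzzzz-j7d0g-workflowdefs",
                       pangenome_workflow_uuid="zzzzz-7fd4e-pangenomewflow",
                       fastq_workflow_uuid="zzzzz-7fd4e-fastq2fastawfl",
                       exclude_list="keep:zzzzz/exclude.txt",
                       latest_result_collection="zzzzz-4zz18-latestresult")

# One polling pass, mirroring the while-loop body of main() above.
analyzer.move_fastq_to_fasta_results()
new_collections = arvados.util.list_all(api.collections().list,
                                        filters=[["owner_uuid", "=", uploader_project]])
at_least_one_new_valid_seq = False
for c in new_collections:
    at_least_one_new_valid_seq = analyzer.validate_upload(c, revalidate=False) or at_least_one_new_valid_seq
if at_least_one_new_valid_seq:
    analyzer.start_pangenome_analysis()
analyzer.copy_most_recent_result()
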
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
index b3d4d12..0aead3b 100644
--- a/bh20sequploader/bh20seq-schema.yml
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -1,6 +1,7 @@
$base: http://biohackathon.org/bh20-seq-schema
$namespaces:
cc: http://creativecommons.org/ns#
+ dc: http://purl.org/metadata/dublin_core_elements#
sch: https://schema.org/
efo: http://www.ebi.ac.uk/efo/
obo: http://purl.obolibrary.org/obo/
@@ -15,24 +16,29 @@ $graph:
fields:
license_type:
doc: License types as defined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
- type: string?
+ type: string
jsonldPredicate:
_id: https://creativecommons.org/ns#License
title:
doc: Attribution title related to data license
type: string?
jsonldPredicate:
- _id: http://semanticscience.org/resource/SIO_001167
+ _id: http://purl.org/metadata/dublin_core_elements#Title
+ attribution_name:
+ doc: Attribution NAME related to data license
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#attributionName
attribution_url:
doc: Attribution URL related to data license
type: string?
jsonldPredicate:
- _id: https://creativecommons.org/ns#Work
+ _id: https://creativecommons.org/ns#attributionURL
attribution_source:
doc: Attribution source URL related to data license
type: string?
jsonldPredicate:
- _id: https://creativecommons.org/ns#Work
+ _id: https://creativecommons.org/ns#attributionSource
- name: hostSchema
type: record
@@ -258,6 +264,7 @@ $graph:
virus: virusSchema
technology: technologySchema
submitter: submitterSchema
+ license: ["null", licenseSchema]
id:
doc: The subject (eg the fasta/fastq file) that the metadata describes
type: string
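
With this schema change license_type becomes required, attribution_name is added, the attribution predicates point at the proper cc: terms, and an optional license block is attached to the main record. A hedged sketch of a metadata.yaml fragment exercising the new block, parsed with ruamel.yaml as the uploader does; every value here is illustrative, not taken from a real submission:

import ruamel.yaml

# Illustrative metadata fragment for the new license block; values are made up.
fragment = """
license:
    license_type: http://creativecommons.org/licenses/by/4.0/
    title: Example SARS-CoV-2 genome assembly       # optional
    attribution_name: Example Sequencing Lab        # optional
    attribution_url: http://example.org/lab         # optional
    attribution_source: http://example.org/lab/run1 # optional
"""

doc = ruamel.yaml.round_trip_load(fragment)
assert doc["license"]["license_type"].startswith("http")
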
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf
index 965229c..bbc7309 100644
--- a/bh20sequploader/bh20seq-shex.rdf
+++ b/bh20sequploader/bh20seq-shex.rdf
@@ -1,6 +1,8 @@
PREFIX : <https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#>
PREFIX MainSchema: <http://biohackathon.org/bh20-seq-schema#MainSchema/>
PREFIX hostSchema: <http://biohackathon.org/bh20-seq-schema#hostSchema/>
+PREFIX cc: <http://creativecommons.org/ns#>
+PREFIX dc: <http://purl.org/metadata/dublin_core_elements#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX sio: <http://semanticscience.org/resource/>
@@ -15,10 +17,11 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
MainSchema:submitter @:submitterShape ;
MainSchema:technology @:technologyShape ;
MainSchema:virus @:virusShape;
+ MainSchema:license @:licenseShape ?;
}
:hostShape {
- efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
+ efo:EFO_0000532 [ obo:NCBITaxon_~ ] ;
sio:SIO_000115 xsd:string ?;
obo:PATO_0000047 [ obo:PATO_0000384 obo:PATO_0000383 obo:PATO_0001340] ?;
obo:PATO_0000011 xsd:integer ?;
@@ -32,14 +35,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
:sampleShape {
sio:SIO_000115 xsd:string;
- evs:C25164 xsd:string;
- obo:GAZ_00000448 [wikidata:~] ;
+ evs:C25164 xsd:string;
+ obo:GAZ_00000448 [wikidata:~] ;
obo:OBI_0001895 xsd:string ?;
obo:NCIT_C41206 xsd:string ?;
obo:OBI_0001479 IRI {0,2};
obo:OBI_0001472 xsd:string ?;
sio:SIO_001167 xsd:string ?;
- edam:data_2091 IRI {0,3};
+ edam:data_2091 IRI {0,3};
}
:submitterShape {
@@ -47,7 +50,7 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
sio:SIO_000116 xsd:string *;
sio:SIO_000172 xsd:string ?;
obo:NCIT_C37984 xsd:string ?;
- obo:NCIT_C37900 xsd:string ?;
+ obo:NCIT_C37900 xsd:string ?;
efo:EFO_0001741 xsd:string ?;
obo:NCIT_C42781 xsd:string ?;
obo:NCIT_C19026 xsd:string ?;
@@ -63,6 +66,14 @@ PREFIX wikidata: <http://www.wikidata.org/entity/>
}
:virusShape{
- edam:data_1875 [ obo:NCBITaxon_~ ] ;
- sio:SIO_010055 xsd:string ?;
+ edam:data_1875 [ obo:NCBITaxon_~ ] ;
+ sio:SIO_010055 xsd:string ?;
}
+
+:licenseShape{
+ cc:License xsd:string ;
+ dc:Title xsd:string ?;
+ cc:attributionName xsd:string ?;
+ cc:attributionURL xsd:string ?;
+ cc:attributionSource xsd:string ?;
+} \ No newline at end of file
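
The new :licenseShape constrains a license node to carry a cc:License string plus optional dc:Title and Creative Commons attribution properties. A minimal sketch of the triples that shape describes, built with rdflib purely for illustration (rdflib is an assumption here; in practice the uploader's qc_metadata step is what validates submissions):

from rdflib import Graph, Literal, Namespace, URIRef

CC = Namespace("http://creativecommons.org/ns#")
DC = Namespace("http://purl.org/metadata/dublin_core_elements#")

g = Graph()
lic = URIRef("http://example.org/metadata#license")  # illustrative subject node
g.add((lic, CC.License, Literal("http://creativecommons.org/licenses/by/4.0/")))
g.add((lic, DC.Title, Literal("Example SARS-CoV-2 genome assembly")))       # optional
g.add((lic, CC.attributionName, Literal("Example Sequencing Lab")))         # optional
g.add((lic, CC.attributionURL, Literal("http://example.org/lab")))          # optional
g.add((lic, CC.attributionSource, Literal("http://example.org/lab/run1")))  # optional

print(g.serialize(format="turtle"))
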
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index f744a8c..6049bf9 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -29,11 +29,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
try:
log.debug("Checking metadata" if do_qc else "Skipping metadata check")
if do_qc and not qc_metadata(metadata.name):
- log.warning("Failed metadata qc")
+ log.warning("Failed metadata QC")
failed = True
except Exception as e:
- log.debug(e)
- print(e)
+ log.exception("Failed metadata QC")
failed = True
target = []
@@ -45,8 +44,7 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
target[0] = ("reads_1."+target[0][0][6:], target[0][1])
target[1] = ("reads_2."+target[1][0][6:], target[0][1])
except Exception as e:
- log.debug(e)
- print(e)
+ log.exception("Failed sequence QC")
failed = True
if failed:
@@ -82,7 +80,7 @@ def main():
seqlabel = target[0][1]
if args.validate:
- print("Valid")
+ log.info("Valid")
exit(0)
col = arvados.collection.Collection(api_client=api)
@@ -91,10 +89,10 @@ def main():
if args.sequence_p2:
upload_sequence(col, target[1], args.sequence_p2)
- print("Reading metadata")
+ log.info("Reading metadata")
with col.open("metadata.yaml", "w") as f:
r = args.metadata.read(65536)
- print(r[0:20])
+ log.info(r[0:20])
while r:
f.write(r)
r = args.metadata.read(65536)
@@ -118,7 +116,7 @@ def main():
["portable_data_hash", "=", col.portable_data_hash()]]).execute()
if dup["items"]:
# This exact collection has been uploaded before.
- print("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
+ log.error("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))
exit(1)
if args.trusted:
@@ -131,9 +129,9 @@ def main():
(seqlabel, properties['upload_user'], properties['upload_ip']),
properties=properties, ensure_unique_name=True)
- print("Saved to %s" % col.manifest_locator())
-
- print("Done")
+ log.info("Saved to %s" % col.manifest_locator())
+ log.info("Done")
+ exit(0)
if __name__ == "__main__":
main()
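
This uploader hunk swaps bare print() calls and log.debug(e) for log.info()/log.error()/log.exception(), so QC failures go through the logging configuration and keep their tracebacks instead of landing on stdout. A small standard-library illustration of why log.exception is the better choice inside an except block (the logger name and message are arbitrary):

import logging

logging.basicConfig(level=logging.INFO,
                    format="[%(asctime)s] %(levelname)s %(message)s")
log = logging.getLogger("bh20sequploader.demo")  # arbitrary name for this sketch

try:
    raise ValueError("bad metadata")
except Exception:
    # Logs at ERROR level and appends the current traceback -- information
    # that print(e) or log.debug(e) would have dropped or hidden.
    log.exception("Failed metadata QC")
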
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 37eb4e8..0c7e16d 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True):
except Exception as e:
logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e)
- if similarity and similarity < 70.0:
+ if similarity < 70.0:
raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity))
- if similarity == 0:
- raise ValueError("QC fail")
return ("sequence.fasta"+gz, seqlabel)
elif seq_type == "text/fastq":
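
In qc_fasta.py the separate `similarity == 0` branch goes away: any alignment below the 70% threshold, zero included, now raises the single descriptive error. A toy restatement of the resulting check (the function name and sample values are made up; only the threshold and message come from the diff):

def check_similarity(similarity):
    # Mirrors the simplified check above: anything under 70% raises the
    # same descriptive QC error, zero included.
    if similarity < 70.0:
        raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % similarity)

check_similarity(95.3)    # passes silently
# check_similarity(0.0)   # would raise ValueError
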
diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py
index 77e345b..206f884 100644
--- a/bh20simplewebuploader/main.py
+++ b/bh20simplewebuploader/main.py
@@ -8,7 +8,7 @@ import os
import sys
import re
import string
-import yaml
+import ruamel.yaml as yaml
import pkg_resources
from flask import Flask, request, redirect, send_file, send_from_directory, render_template, jsonify
import os.path
@@ -16,6 +16,9 @@ import requests
import io
import arvados
from markupsafe import Markup
+from schema_salad.sourceline import add_lc_filename
+from schema_salad.schema import shortname
+from typing import MutableSequence, MutableMapping
ARVADOS_API = 'lugli.arvadosapi.com'
ANONYMOUS_TOKEN = '5o42qdxpxp5cj15jqjf7vnxx5xduhm4ret703suuoa3ivfglfh'
@@ -47,6 +50,9 @@ def type_to_heading(type_name):
Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading.
"""
+ type_name = shortname(type_name)
+
+ print(type_name,file=sys.stderr)
# Remove camel case
decamel = re.sub('([A-Z])', r' \1', type_name)
# Split
@@ -77,7 +83,7 @@ def is_iri(string):
return string.startswith('http')
-def generate_form(schema, options):
+def generate_form(components, options):
"""
Linearize the schema into a list of dicts.
@@ -100,9 +106,6 @@ def generate_form(schema, options):
IRI.
"""
- # Get the list of form components, one of which is the root
- components = schema.get('$graph', [])
-
# Find the root
root_name = None
# And also index components by type name
@@ -130,55 +133,54 @@ def generate_form(schema, options):
# First make a heading, if we aren't the very root of the form
yield {'heading': type_to_heading(type_name)}
- for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items():
+ for field in by_name.get(type_name, {}).get('fields', []):
+ field_name = shortname(field["name"])
+ field_type = field["type"]
# For each field
ref_iri = None
docstring = None
- if not isinstance(field_type, str):
- # If the type isn't a string
-
- # It may have documentation
- docstring = field_type.get('doc', None)
-
- # See if it has a more info/what goes here URL
- predicate = field_type.get('jsonldPredicate', {})
- # Predicate may be a URL, a dict with a URL in _id, maybe a
- # dict with a URL in _type, or a dict with _id and _type but no
- # URLs anywhere. Some of these may not technically be allowed
- # by the format, but if they occur, we might as well try to
- # handle them.
- if isinstance(predicate, str):
- if is_iri(predicate):
- ref_iri = predicate
- else:
- # Assume it's a dict. Look at the fields we know about.
- for field in ['_id', 'type']:
- field_value = predicate.get(field, None)
- if isinstance(field_value, str) and is_iri(field_value) and ref_iri is None:
- # Take the first URL-looking thing we find
- ref_iri = field_value
- break
-
- # Now overwrite the field type with the actual type string
- field_type = field_type.get('type', '')
-
- # Decide if the field is optional (type ends in ?)
optional = False
- if field_type.endswith('?'):
- # It's optional
- optional = True
- # Drop the ?
- field_type = field_type[:-1]
-
- # Decide if the field is a list (type ends in [])
is_list = False
- if field_type.endswith('[]'):
- # It's a list
- is_list = True
- # Reduce to the normal type
- field_type = field_type[:-2]
+
+ # It may have documentation
+ docstring = field.get('doc', None)
+
+ # See if it has a more info/what goes here URL
+ predicate = field.get('jsonldPredicate', {})
+ # Predicate may be a URL, a dict with a URL in _id, maybe a
+ # dict with a URL in _type, or a dict with _id and _type but no
+ # URLs anywhere. Some of these may not technically be allowed
+ # by the format, but if they occur, we might as well try to
+ # handle them.
+ if isinstance(predicate, str):
+ if is_iri(predicate):
+ ref_iri = predicate
+ else:
+ # Assume it's a dict. Look at the fields we know about.
+ for field in ['_id', 'type']:
+ field_value = predicate.get(field, None)
+ if isinstance(field_value, str) and is_iri(field_value) and ref_iri is None:
+ # Take the first URL-looking thing we find
+ ref_iri = field_value
+ break
+
+ if isinstance(field_type, MutableSequence):
+ if field_type[0] == "null" and len(field_type) == 2:
+ optional = True
+ field_type = field_type[1]
+ else:
+ raise Exception("Can't handle it")
+
+ if isinstance(field_type, MutableMapping):
+ if field_type["type"] == "array":
+ # Now replace the field type with the actual type string
+ is_list = True
+ field_type = field_type.get('items', '')
+ else:
+ field_type = field_type.get('type', '')
+ pass
if field_type in by_name:
# This is a subrecord. We need to recurse
@@ -226,10 +228,24 @@ def generate_form(schema, options):
return list(walk_fields(root_name))
-# At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
-METADATA_SCHEMA = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
-METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
-FORM_ITEMS = generate_form(METADATA_SCHEMA, METADATA_OPTION_DEFINITIONS)
+import schema_salad.schema
+def load_schema_generate_form():
+ # At startup, we need to load the metadata schema from the uploader module, so we can make a form for it
+ if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
+ METADATA_SCHEMA = yaml.round_trip_load(open("bh20sequploader/bh20seq-schema.yml","r").read())
+ METADATA_OPTION_DEFINITIONS = yaml.safe_load(open("bh20sequploader/bh20seq-options.yml","r").read())
+ else:
+ METADATA_SCHEMA = yaml.round_trip_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
+ METADATA_OPTION_DEFINITIONS = yaml.safe_load(pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))
+
+ METADATA_SCHEMA["name"] = "bh20seq-schema.yml"
+ add_lc_filename(METADATA_SCHEMA, "bh20seq-schema.yml")
+ metaschema_names, _metaschema_doc, metaschema_loader = schema_salad.schema.get_metaschema()
+ schema_doc, schema_metadata = metaschema_loader.resolve_ref(METADATA_SCHEMA, "")
+
+ return generate_form(schema_doc, METADATA_OPTION_DEFINITIONS)
+
+FORM_ITEMS = load_schema_generate_form()
@app.route('/')
def send_home():
@@ -237,7 +253,7 @@ def send_home():
Send the front page.
"""
- return render_template('home.html', menu='HOME')
+ return render_template('home.html', menu='HOME', load_map=True)
@app.route('/upload')
@@ -429,17 +445,21 @@ def receive_files():
if result.returncode != 0:
# It didn't work. Complain.
- error_message="Uploader returned value {} and said:".format(result.returncode) + str(result.stderr.decode('utf-8'))
+ error_message="Uploader returned value {} and said:\n".format(result.returncode) + str(result.stderr.decode('utf-8'))
print(error_message, file=sys.stderr)
return (render_template('error.html', error_message=error_message), 403)
else:
# It worked. Say so.
- return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace'))
+ return render_template('success.html', log=result.stderr.decode('utf-8', errors='replace'))
finally:
shutil.rmtree(dest_dir)
-def get_html_body(fn):
- buf = ""
+
+def edit_button(url,text="Edit text!"):
+ return '<p class="editbutton"><a href="'+url+'">'+text+'<img src="static/image/edit.png"></a></p>'
+
+def get_html_body(fn,source="https://github.com/arvados/bh20-seq-resource/tree/master/doc"):
+ buf = edit_button(source)
in_body = False
begin_body = re.compile(r"<body>",re.IGNORECASE)
end_body = re.compile(r"(</body>|.*=\"postamble\")",re.IGNORECASE)
@@ -451,11 +471,12 @@ def get_html_body(fn):
buf += line
elif begin_body.match(line):
in_body = True
+ buf += edit_button(source)
return buf
@app.route('/download')
def download_page():
- buf = get_html_body('doc/web/download.html')
+ buf = get_html_body('doc/web/download.html','https://github.com/arvados/bh20-seq-resource/blob/master/doc/web/download.org')
return render_template('resource.html',menu='DOWNLOAD',embed=buf)
def pending_table(output, items):
@@ -468,10 +489,13 @@ def pending_table(output, items):
for r in items:
if r["status"] != "pending":
continue
- output.write("<tr>")
- output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
- output.write("<td>%s</td>" % Markup.escape(r["sequence_label"]))
- output.write("</tr>")
+ try:
+ output.write("<tr>")
+ output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+ output.write("<td>%s</td>" % Markup.escape(r.get("sequence_label")))
+ output.write("</tr>")
+ except:
+ pass
output.write(
"""
</table>
@@ -486,18 +510,69 @@ def rejected_table(output, items):
<th>Errors</th></tr>
""")
for r in items:
- if r["status"] != "rejected":
- continue
+ try:
+ if r["status"] != "rejected":
+ continue
+ output.write("<tr>")
+ output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+ output.write("<td>%s</td>" % Markup.escape(r.get("sequence_label")))
+ output.write("<td><pre>%s</pre></td>" % Markup.escape("\n".join(r.get("errors", []))))
+ output.write("</tr>")
+ except:
+ pass
+ output.write(
+"""
+</table>
+""")
+
+def workflows_table(output, items):
+ output.write(
+"""
+<table>
+<tr>
+<th>Name</th>
+<th>Sample id</th>
+<th>Started</th>
+<th>Container request</th>
+</tr>
+""")
+ for r in items:
output.write("<tr>")
- output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
- output.write("<td>%s</td>" % Markup.escape(r["sequence_label"]))
- output.write("<td><pre>%s</pre></td>" % Markup.escape("\n".join(r.get("errors", []))))
+ try:
+ sid = r["mounts"]["/var/lib/cwl/cwl.input.json"]["content"]["sample_id"]
+ output.write("<td>%s</td>" % Markup.escape(r["name"]))
+ output.write("<td>%s</td>" % Markup.escape(sid))
+ output.write("<td>%s</td>" % Markup.escape(r["created_at"]))
+ output.write("<td><a href='https://workbench.lugli.arvadosapi.com/container_requests/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+ except:
+ pass
output.write("</tr>")
output.write(
"""
</table>
""")
+def validated_table(output, items):
+ output.write(
+"""
+<table>
+<tr>
+<th>Collection</th>
+<th>Sequence label</th>
+</tr>
+""")
+ for r in items:
+ try:
+ output.write("<tr>")
+ output.write("<td><a href='https://workbench.lugli.arvadosapi.com/collections/%s'>%s</a></td>" % (r["uuid"], r["uuid"]))
+ output.write("<td>%s</td>" % Markup.escape(r["properties"].get("sequence_label")))
+ output.write("</tr>")
+ except:
+ pass
+ output.write(
+"""
+</table>
+""")
@app.route('/status')
def status_page():
@@ -518,46 +593,57 @@ def status_page():
prop["uuid"] = p["uuid"]
status[prop["status"]] = status.get(prop["status"], 0) + 1
+ workflows = arvados.util.list_all(api.container_requests().list,
+ filters=[["name", "in", ["fastq2fasta.cwl"]], ["state", "=", "Committed"]],
+ order="created_at asc")
+
output = io.StringIO()
validated = api.collections().list(filters=[["owner_uuid", "=", VALIDATED_PROJECT]], limit=1).execute()
status["passed"] = validated["items_available"]
- for s in (("passed", "/download"), ("pending", "#pending"), ("rejected", "#rejected")):
+ for s in (("passed", "/validated"), ("pending", "#pending"), ("rejected", "#rejected")):
output.write("<p><a href='%s'>%s sequences QC %s</a></p>" % (s[1], status.get(s[0], 0), s[0]))
- output.write("<a id='pending'><h1>Pending</h1>")
+ output.write("<p><a href='%s'>%s analysis workflows running</a></p>" % ('#workflows', len(workflows)))
+
+ output.write("<a id='pending'><h1>Pending</h1></a>")
pending_table(output, out)
- output.write("<a id='rejected'><h1>Rejected</h1>")
+ output.write("<a id='rejected'><h1>Rejected</h1></a>")
rejected_table(output, out)
+ output.write("<a id='workflows'><h1>Running Workflows</h1></a>")
+ workflows_table(output, workflows)
+
return render_template('status.html', table=Markup(output.getvalue()), menu='STATUS')
+@app.route('/validated')
+def validated_page():
+ api = arvados.api(host=ARVADOS_API, token=ANONYMOUS_TOKEN, insecure=True)
+ output = io.StringIO()
+ validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", VALIDATED_PROJECT]])
+ validated_table(output, validated)
+ return render_template('validated.html', table=Markup(output.getvalue()), menu='STATUS')
+
@app.route('/demo')
def demo_page():
- return render_template('demo.html',menu='DEMO')
+ return render_template('demo.html',menu='DEMO',load_map=True)
@app.route('/blog',methods=['GET'])
def blog_page():
blog_content = request.args.get('id') # e.g. using-covid-19-pubseq-part3
buf = None;
if blog_content:
- buf = get_html_body('doc/blog/'+blog_content+'.html')
+ buf = get_html_body('doc/blog/'+blog_content+'.html',"https://github.com/arvados/bh20-seq-resource/blob/master/doc/blog/"+blog_content+".org")
return render_template('blog.html',menu='BLOG',embed=buf,blog=blog_content)
@app.route('/about')
def about_page():
- buf = get_html_body('doc/web/about.html')
+ buf = get_html_body('doc/web/about.html','https://github.com/arvados/bh20-seq-resource/blob/master/doc/web/about.org')
return render_template('about.html',menu='ABOUT',embed=buf)
-##
-@app.route('/map')
-def map_page():
- return render_template('map.html',menu='DEMO')
-
-
## Dynamic API functions starting here
## This is quick and dirty for now, just to get something out and demonstrate the queries
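
The web uploader now resolves the metadata schema through schema-salad instead of treating it as plain YAML: load_schema_generate_form() round-trips the document, registers its filename, resolves it against the salad metaschema and hands the resolved components to generate_form(). A short usage sketch, assuming the package is importable from a checkout so the module-level initialisation above succeeds; the non-heading items are whatever dicts walk_fields() yields:

from bh20simplewebuploader.main import load_schema_generate_form

for item in load_schema_generate_form():
    if 'heading' in item:
        print("==", item['heading'])   # one heading per record type
    else:
        print("  ", item)              # a field dict produced by walk_fields()
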
diff --git a/bh20simplewebuploader/static/blog.css b/bh20simplewebuploader/static/blog.css
index 82a7c51..3ee8c44 100644
--- a/bh20simplewebuploader/static/blog.css
+++ b/bh20simplewebuploader/static/blog.css
@@ -4,12 +4,13 @@
.timestamp { font-family: monospace; color: darkgreen; }
h1,h2 { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace; color:black;background-color:#F0F8FF; }
-h2 { color: darkblue; }
-h3,h4 { color: black; }
+h2 { color: black; }
+h3,h4 { color: black; margin:0; }
code { color: darkblue; }
-body {font-family: Palatino, 'Palatino Linotype', serif; color:black;background-color:#e5d8f0; font-size: large }
+body {font-family: Palatino, 'Palatino Linotype', serif; color:black; background-color:white; font-size: large }
-div.verbatim { margin: 30px; color: black; background-color: white; border-style:outset; font-family: palatino font, monospace; font-size:80%; font-weight:bold; }
+div.verbatim { margin: 30px; color: black; background-color: white; border-style:outset;
+ font-family: palatino font, monospace; font-size:80%; font-weight:bold; }
div.quote { font-family: palatino font, monospace; font-size:80%; }
div.quotation { font-family: palatino font, monospace; font-size:80%; }
pre.example { margin: 30px; font-family: prestige, monospace; color:black; font-size:70%; background-color: lightyellow; }
@@ -23,10 +24,10 @@ div[id="text-table-of-contents"]{
}
div[class^="outline-text"] {
margin: 10px;
- background-color:#ebe6f0;
- border-style: dotted;
- border-color: #98bf21;
- border-width: 1px;
+ // background-color:white;
+ // border-style: dotted;
+ // border-color: #98bf21;
+ // border-width: 1px;
font-family: Palatino, 'Palatino Linotype', serif; color:black; font-size: large
}
span[class="todo TESTING"] {
diff --git a/bh20simplewebuploader/static/image/CWL.png b/bh20simplewebuploader/static/image/CWL.png
new file mode 100644
index 0000000..81d1807
--- /dev/null
+++ b/bh20simplewebuploader/static/image/CWL.png
Binary files differ
diff --git a/bh20simplewebuploader/static/image/ESR.png b/bh20simplewebuploader/static/image/ESR.png
new file mode 100644
index 0000000..557c798
--- /dev/null
+++ b/bh20simplewebuploader/static/image/ESR.png
Binary files differ
diff --git a/bh20simplewebuploader/static/image/curii.logo.ai.png b/bh20simplewebuploader/static/image/curii.logo.ai.png
new file mode 100644
index 0000000..401afad
--- /dev/null
+++ b/bh20simplewebuploader/static/image/curii.logo.ai.png
Binary files differ
diff --git a/bh20simplewebuploader/static/image/curii.logo.ai.svg b/bh20simplewebuploader/static/image/curii.logo.ai.svg
new file mode 100644
index 0000000..e87ea05
--- /dev/null
+++ b/bh20simplewebuploader/static/image/curii.logo.ai.svg
@@ -0,0 +1,3 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg width="1333.3" height="1333.3" version="1.1" viewBox="0 0 1333.3 1333.3" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" xmlns:cc="http://creativecommons.org/ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><metadata><rdf:RDF><cc:Work rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/><dc:title/></cc:Work></rdf:RDF></metadata><defs><clipPath id="clipPath18"><path d="m0 1e3h1e3v-1e3h-1e3z"/></clipPath></defs><g transform="matrix(1.3333 0 0 -1.3333 0 1333.3)"><g clip-path="url(#clipPath18)"><g transform="translate(473.7 553.52)"><path d="m0 0c-23.91 0-41.063-17.932-41.063-44.182 0-26.251 17.153-44.179 41.063-44.179 17.414 0 31.448 5.714 40.805 16.63l16.63-16.63c-12.476-16.377-32.488-25.735-58.734-25.735-41.063 0-69.653 28.329-69.653 69.914 0 41.843 28.59 70.428 69.653 70.428 24.95 0 44.439-8.053 57.174-22.611l-16.37-19.229c-9.619 9.876-22.875 15.851-39.505 15.594" fill="#008a82"/></g><g transform="translate(681.34 578.73)"><path d="m0 0v-138.26h-29.889v27.029c-9.355-19.232-26.765-27.806-49.377-28.07-31.707 0-50.676 20.276-50.676 53.281v86.022h29.884v-77.707c0-20.792 12.216-33.526 32.228-33.526 23.911 0.523 37.941 18.972 37.941 43.143v68.09z" fill="#008a82"/></g><g transform="translate(796.73 579.77)"><path d="m0 0v-28.585c-27.809 1.556-45.481-14.815-47.818-38.468v-72.248h-29.887v138.26h29.887v-27.546c9.356 18.71 25.99 28.585 47.818 28.585" fill="#008a82"/></g><g transform="translate(847.15 618.24)"><path d="m0 0c0-10.136-7.54-17.673-17.412-17.673-9.877 0-17.413 7.537-17.413 17.673 0 10.396 7.536 17.933 17.413 17.933 9.872 0 17.412-7.537 17.412-17.933" fill="#f15a29"/></g><path d="m844.54 440.47h-29.625v138.26h29.625z" fill="#008a82"/><g transform="translate(914.46 618.24)"><path d="m0 0c0-10.136-7.541-17.673-17.413-17.673-9.876 0-17.413 7.537-17.413 17.673 0 10.396 7.537 17.933 17.413 17.933 9.872 0 17.413-7.537 17.413-17.933" fill="#f15a29"/></g><path d="m911.85 440.47h-29.625v138.26h29.625z" fill="#008a82"/><g transform="translate(131.67 492.85)"><path d="m0 0c0.628 4.633 3.756 8.256 7.824 9.843l-9.454 65.551c-0.213 0.021-0.426-0.01-0.638 0.021-2.008 0.27-3.807 1.046-5.367 2.126l-57.718-52.611c2.31-21.875 9.31-42.198 19.929-60.052l46.065 29.321c-0.641 1.805-0.924 3.766-0.641 5.801" fill="#f15a29"/></g><g transform="translate(135.54 569.04)"><path d="m0 0 9.451-65.545c0.223-0.018 0.432 0.01 0.648-0.02 0.975-0.131 1.887-0.395 2.767-0.737l38.262 56.827-44.955 14.619c-1.414-2.373-3.608-4.149-6.173-5.144" fill="#f15a29"/></g><g transform="translate(280.25 609.22)"><path d="m0 0-48.729-0.574c-1.211-5.586-6.039-9.562-11.655-9.744l-24.468-36.342 53.617-17.44c0 0.809-0.074 1.62-0.051 2.43 0.912 25.152 12.634 47.22 31.286 61.67" fill="#f15a29"/></g><g transform="translate(217.14 623.42)"><path d="m0 0 2.255 58.382c-4.958 0.183-9.984 0.112-15.047-0.205-15.084-0.972-29.49-4.186-42.934-9.259l51.399-50.633c1.31 0.834 2.768 1.411 4.327 1.715" fill="#f15a29"/></g><g transform="translate(132.75 592.86)"><path d="m0 0c6.582-0.901 11.22-6.828 10.639-13.386l46.505-15.128 24.123 35.815c-4.675 2.295-7.615 7.341-6.872 12.785 0.239 1.772 0.867 3.403 1.748 4.824l-53.093 52.29c-12.31-5.236-23.718-12.06-34.013-20.225l9.958-56.937c0.33-0.028 0.661 6e-3 1.005-0.038" fill="#f15a29"/></g><g transform="translate(135.28 482.35)"><path d="m0 0-46.12-29.351c14.73-22.781 35.474-41.144 59.841-53.023l-6.781 78.911c-2.738 0.383-5.104 1.664-6.94 3.463" fill="#008a82"/></g><g 
transform="translate(118.77 582.23)"><path d="m0 0c0.611 4.509 3.588 8.073 7.483 9.721l-9.4 53.816c-33.716-29.174-53.94-73.126-51.126-121.01l54.559 49.732c-1.283 2.272-1.897 4.952-1.516 7.74" fill="#f15a29"/></g><g transform="translate(361.67 546.89)"><path d="m0 0c-7.048-4.637-14.905-8.13-23.401-10.048-23.806-5.372-49.043-2.852-73.21-5.508 3.301-19.489 10.484-39.441 11.517-59.119 1.522-29.071-4.033-57.766-26.257-78.143-2.545-2.344-5.248-4.393-8.036-6.28 72.262 13.991 124.59 79.812 119.73 155.21-0.078 1.306-0.227 2.592-0.345 3.888" fill="#008a82"/></g><g transform="translate(250.67 529.01)"><path d="m0 0c-8.863-1.971-17.477-5.104-25.76-9.947 7.733-9.046 15.469-18.091 23.188-27.143 4.901-5.729-4.405-12.76-9.275-7.059-7.844 9.184-15.692 18.355-23.533 27.543-3.244-2.674-6.116-5.496-8.563-8.494 13.566-16.281 27.111-32.576 40.666-48.876 4.826-5.793-4.486-12.83-9.286-7.061-12.67 15.225-25.348 30.468-38.019 45.701-2.14-4.209-3.767-8.604-4.84-13.168 8.495-9.946 16.998-19.892 25.493-29.843 4.898-5.729-4.411-12.758-9.281-7.061-6.019 7.032-12.026 14.071-18.048 21.114 0 0-4.766-16.237-0.865-52.291-1.66 0.746-3.085 2.109-3.847 4.189-9.88 27.309-16.654 62.017 0.85 88.074 12.678 18.854 32.288 30.459 53.735 36.456l-51.115 16.627-39.174-58.167c2.461-2.63 3.791-6.261 3.274-10.104-0.661-4.881-4.104-8.68-8.509-10.1l7.041-82.087c11.054-4.806 22.783-8.293 34.971-10.297 31.35 2.055 62.177 14.151 70.26 48.559 4.169 17.775 3.976 36.163 0.456 54.069-2.558 13.035-7.277 26.098-9.819 39.366" fill="#008a82"/></g><g transform="translate(312.28 562.6)"><path d="m0 0c-9.515 0-17.227 7.709-17.227 17.228 0 9.514 7.712 17.227 17.227 17.227 9.519 0 17.228-7.713 17.228-17.227 0-9.519-7.709-17.228-17.228-17.228m-87.342 118.87-2.248-58.267c4.425-1.198 7.753-4.685 8.799-9.009l56.965 0.679c7.689 4.574 16.285 7.993 25.578 9.991 2.66 0.576 4.931-0.281 6.548-1.786l-2.677 1.573c-28.575-13.342-42.238-34.715-48.462-48.807-2.258-4.6-3.79-9.265-4.739-13.983-0.01-0.048-0.03-0.179-0.03-0.179-1.08-5.452-1.428-10.967-1.175-16.536 30.725 3.518 70.847-3.875 96.101 16.526-12.633 65.686-68.086 114.92-134.66 119.8" fill="#f15a29"/></g></g></g></svg> \ No newline at end of file
diff --git a/bh20simplewebuploader/static/image/edit.png b/bh20simplewebuploader/static/image/edit.png
new file mode 100644
index 0000000..571b08c
--- /dev/null
+++ b/bh20simplewebuploader/static/image/edit.png
Binary files differ
diff --git a/bh20simplewebuploader/static/main.css b/bh20simplewebuploader/static/main.css
index b9b27f4..6e651a4 100644
--- a/bh20simplewebuploader/static/main.css
+++ b/bh20simplewebuploader/static/main.css
@@ -47,7 +47,7 @@ h2 > svg {
float: right;
}
-#map {
+#mapid {
width: 800px;
height: 440px;
border: 1px solid #AAA;
@@ -178,7 +178,7 @@ span.dropt:hover {text-decoration: none; background: #ffffff; z-index: 6; }
.about {
display: grid;
- grid-template-columns: repeat(2, 1fr);
+ grid-template-columns: 1fr 1fr;
grid-auto-flow: row;
}
@@ -229,7 +229,7 @@ a {
#metadata_fill_form {
column-count: 4;
margin-top: 0.5em;
- column-width: 250px;
+ column-width: 15em;
}
.record, .record .field-group, .record .field-group .field {
@@ -238,6 +238,8 @@ a {
-webkit-column-break-inside: avoid; /* Chrome, Safari, Opera */
page-break-inside: avoid; /* Firefox */
break-inside: avoid;
+ display: block;
+ width: 90%;
}
.record {
@@ -258,6 +260,10 @@ a {
width: max-content;
}
+.control {
+ width: 100%;
+}
+
.filter-options {
width: 100%;
}
@@ -304,9 +310,10 @@ footer {
}
.sponsors img {
- width: 80%;
- display:block;
- margin:auto;
+ width: auto;
+ display: block;
+ margin: auto;
+ height: 4em;
}
.loader {
@@ -377,3 +384,22 @@ div.status {
vertical-align: top;
border-bottom: 1px solid #ddd;
}
+
+.map {
+ padding: 20px 32px;
+ // display: inline-block;
+}
+
+.editbutton {
+ float: right;
+ text-align: right;
+ background-color: lightgrey;
+ border: 2px solid #4CAF50;
+ border-radius: 12px;
+ color: black;
+ padding: 5px 32px;
+ // text-decoration: none;
+ display: inline-block;
+ font-size: 16px;
+ box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2), 0 6px 20px 0 rgba(0,0,0,0.19);
+}
diff --git a/bh20simplewebuploader/static/main.js b/bh20simplewebuploader/static/main.js
index 751e478..1633c25 100644
--- a/bh20simplewebuploader/static/main.js
+++ b/bh20simplewebuploader/static/main.js
@@ -13,70 +13,41 @@ function myFunction() {
}
}
-let map = L.map( 'map', {
- center: [37.0902, -95.7129], // Default to U.S.A
- minZoom: 3,
- zoom: 0
-});
-L.tileLayer( 'http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
- attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a>',
- subdomains: ['a','b','c']
-}).addTo( map );
-
-let markers = L.markerClusterGroup().addTo(map)
-
-
function fetchAPI(apiEndPoint) {
- fetch(scriptRoot + apiEndPoint)
- .then(response => {
- return response.json();
- })
- .then(data => {
- console.log(data);
- markers.clearLayers();
- document.getElementById("results").classList.remove("invisible");
- document.getElementById("loader").classList.add("invisible");
- if (!(apiEndPoint === "/api/getAllaccessions")) {
- for (let i = 0; i < data.length; i++) {
- let {"count": fastaCount, GPS, LocationLabel: label } = data[i];
- let coordinates = GPS.split(" ");
- if (!(coordinates == null)) {
- let lat, lon;
- [lon, lat] = coordinates.map(parseFloat);
- let point = L.point()
- let marker = L.marker([lat, lon]);
- marker.bindPopup("<b>" + label + "</b><br/>" + "FastaCount: " +fastaCount);
- markers.addLayer(marker)
- }}
- }
- // Reload the map
- map.invalidateSize();
- });
- document.getElementById("results").classList.add("invisible");
- document.getElementById("loader").classList.remove("invisible");
-
-}
-
-// Copy from function above but now added as table instead of plain json
-function fetchAPIV2(apiEndPoint) {
- fetch(scriptRoot + apiEndPoint)
- .then(response => {
- return response.json();
- })
- .then(data => {
- console.log(data)
- htmlString="<table>"
-
- // Depending on what we want to explore we'd have to call a different function ....? But how to Include that?
- for (var i=0; i<data.length;i++) {
- htmlString=htmlString+"<tr><td><a href='#' onclick='fetchSEQByLocation(\""+data[i]["key"]+"\");'>"+data[i]["label"]+"</a></td><td>"+data[i]["count"]+"<td></tr>"
- }
- htmlString=htmlString+"</table>"
-
- document.getElementById("table").innerHTML = htmlString
- });
-
- document.getElementById("results").classList.add("invisible");
+ fetch(scriptRoot + apiEndPoint)
+ .then(response => {
+ return response.json();
+ })
+ .then(data => {
+ console.log(data);
+ });
+ document.getElementById("map_view").classList.add("invisible");
+ document.getElementById("loader").classList.remove("invisible");
+}
+
+// Copy from function above but now output HTML table instead of plain json
+function fetchHTMLTable(apiEndPoint) {
+ fetch(scriptRoot + apiEndPoint)
+ .then(response => {
+ return response.json();
+ })
+ .then(data => {
+ console.log(data)
+ htmlString="<table>"
+
+      // Depending on what we want to explore we may need to call a different function here; how to include that is still an open question.
+ /*
+ for (var i=0; i<data.length;i++) {
+ htmlString=htmlString+"<tr><td><a href='#' onclick='fetchSEQByLocation(\""+data[i]["key"]+"\");'>"+data[i]["label"]+"</a></td><td>"+data[i]["count"]+"<td></tr>"
+ }
+*/
+ for (var i=0; i<data.length;i++) {
+ htmlString=htmlString+"<tr><td>"+data[i]["label"]+"</td><td>"+data[i]["count"]+"<td></tr>"
+ }
+ htmlString=htmlString+"</table>"
+
+ document.getElementById("table").innerHTML = htmlString
+ });
}
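+
+// A minimal sketch of the row shape the count endpoints are assumed to return,
+// inferred from how the loops above read data[i]["key"], data[i]["label"] and
+// data[i]["count"]; the counts below are illustrative values, not real data.
+const exampleCountRows = [
+  { key: "http://www.wikidata.org/entity/Q1223", label: "Washington state", count: 300 },
+  { key: "http://www.wikidata.org/entity/Q43",   label: "Turkey",           count: 7 }
+];
+// e.g. fetchHTMLTable("/api/getSEQCountbyLocation") would render rows like these as a table.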
@@ -85,36 +56,39 @@ let search = () => {
fetchAPI(scriptRoot + "/api/getDetailsForSeq?seq=" + encodeURIComponent(m));
}
+// Get count from Arvados
let fetchCount = () => {
fetchAPI("/api/getCount");
}
+// Get count from Virtuoso
let fetchCountDB = () => {
fetchAPI("/api/getCountDB");
}
let fetchSEQCountBySpecimen = () => {
- fetchAPIV2("/api/getSEQCountbySpecimenSource");
+ fetchHTMLTable("/api/getSEQCountbySpecimenSource");
}
let fetchSEQCountByLocation = () => {
- fetchAPIV2("/api/getSEQCountbyLocation");
+ fetchHTMLTable("/api/getSEQCountbyLocation");
}
let fetchSEQCountByTech = () => {
- fetchAPIV2("/api/getSEQCountbytech");
+ fetchHTMLTable("/api/getSEQCountbytech");
}
let fetchAllaccessions = () => {
- fetchAPI("/api/getAllaccessions");
+ fetchHTMLTable("/api/getAllaccessions");
};
-let fetchCountByGPS = () => {
- fetchAPI("/api/getCountByGPS");
+let fetchMap = () => {
+    // drawMap() (in map.js) fetches /api/getCountByGPS and passes the rows to updateMapMarkers(data)
+    drawMap();
};
let fetchSEQCountbyLocation = () => {
- fetchAPIV2("/api/getSEQCountbyLocation");
+ fetchHTMLTable("/api/getSEQCountbyLocation");
};
let fetchSEQByLocation = () => {
@@ -122,7 +96,7 @@ let fetchSEQByLocation = () => {
};
let fetchSEQCountbyContinent = () => {
- fetchAPIV2("/api/getSEQCountbyContinent");
+ fetchHTMLTable("/api/getSEQCountbyContinent");
}
@@ -195,7 +169,7 @@ function addField(e) {
// Increment the number and use the keypath and number to set IDs and cross
// references.
// TODO: Heavily dependent on the form field HTML. Maybe we want custom
- // elements for the labeled controlsd that know how to be list items?
+ // elements for the labeled controls that know how to be list items?
fieldNumber++
newField.dataset.number = fieldNumber
let newID = keypath + '[' + fieldNumber + ']'
@@ -252,36 +226,3 @@ function on_submit_button() {
return false;
}
}
-
-
-
-//
-
-function drawMap(){
-
-// initialize the map on the "map" div with a given center and zoom
-var mymap = L.map('mapid').setView([51.505, -0.09], 1);
-
-L.tileLayer('https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
- attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors'
-}).addTo(mymap);
-
-fetch(scriptRoot + "api/getCountByGPS")
- .then(response => {
- console.log(response)
- return response.json();
- })
- .then(data => {
-
- for (var i=0; i<data.length;i++) {
- gps=data[i]["GPS"].split(" ")
- var circle = L.circle([gps[1], gps[0]], {
- color: 'red',
- fillColor: '#f03',
- fillOpacity: 0.5,
- radius: parseInt(data[i]["count"]) //not working for whatever reason
- }).addTo(mymap);
- }
-
- });
-}
diff --git a/bh20simplewebuploader/static/map.js b/bh20simplewebuploader/static/map.js
new file mode 100644
index 0000000..1003f7d
--- /dev/null
+++ b/bh20simplewebuploader/static/map.js
@@ -0,0 +1,50 @@
+
+var map = L.map( 'mapid', {
+    center: [51.505, -0.09], // Default to London
+ minZoom: 2,
+ zoom: 0
+});
+
+L.tileLayer( 'http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
+ attribution: '&copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> | <a href="http://covid19.genenetwork.org/">COVID-19 PubSeq</a>',
+ subdomains: ['a','b','c']
+}).addTo(map);
+
+
+function drawMap(){
+ var mymap = map;
+
+ fetch(scriptRoot + "api/getCountByGPS")
+ .then(response => {
+ console.log(response)
+ return response.json();
+ })
+ .then(data => {
+ updateMapMarkers(data);
+
+ });
+ document.getElementById("map_view").classList.remove("invisible");
+ map.invalidateSize();
+}
+
+
+
+/* This function updates the map with markers
+ *
+*/
+function updateMapMarkers(data) {
+ let markers = L.markerClusterGroup();
+ for (let i = 0; i < data.length; i++) {
+ let {"count": fastaCount, GPS, LocationLabel: label } = data[i];
+ let coordinates = GPS.split(" ");
+ if (!(coordinates == null)) {
+ let lat, lon;
+ [lon, lat] = coordinates.map(parseFloat);
+ let point = L.point()
+      let marker = L.marker([lat, lon]);
+ marker.bindPopup("<b>" + label + "</b><br/>" + "SARS-CoV-2<br/>sequences: " +fastaCount);
+ markers.addLayer(marker);
+ }
+ }
+ map.addLayer(markers);
+}
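+
+// A usage sketch: updateMapMarkers expects rows shaped like the /api/getCountByGPS
+// response consumed by drawMap() above, with "GPS" ordered "<longitude> <latitude>".
+// The row below uses illustrative values, not real data.
+function exampleMapUpdate() {
+  updateMapMarkers([
+    { "count": 12, "GPS": "-0.09 51.505", "LocationLabel": "London, United Kingdom" }
+  ]);
+}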
diff --git a/bh20simplewebuploader/templates/blog.html b/bh20simplewebuploader/templates/blog.html
index dbc0b99..823f8a1 100644
--- a/bh20simplewebuploader/templates/blog.html
+++ b/bh20simplewebuploader/templates/blog.html
@@ -9,9 +9,7 @@
{{ embed|safe }}
<hr>
- <p>
- Other blog entries:
- </p>
+ <h1>Other blog entries</h1>
{% else %}
{% include 'blurb.html' %}
diff --git a/bh20simplewebuploader/templates/blurb.html b/bh20simplewebuploader/templates/blurb.html
index 80fd384..9eef7c2 100644
--- a/bh20simplewebuploader/templates/blurb.html
+++ b/bh20simplewebuploader/templates/blurb.html
@@ -2,7 +2,7 @@
This is the COVID-19 Public Sequence Resource (COVID-19 PubSeq) for
SARS-CoV-2 virus sequences. COVID-19 PubSeq is a repository for
sequences with a low barrier to entry for uploading sequence data
- using best practices. I.e., data published with a creative commons
+  using best practices, including <a href="https://en.wikipedia.org/wiki/FAIR_data">FAIR data</a>. That is, data is published with a Creative Commons
CC0 or CC-4.0 license with metadata using state-of-the art standards
and, perhaps most importantly, providing standardised workflows that
get triggered on upload, so that results are immediately available
diff --git a/bh20simplewebuploader/templates/demo-run.html b/bh20simplewebuploader/templates/demo-run.html
deleted file mode 100644
index a8f9edc..0000000
--- a/bh20simplewebuploader/templates/demo-run.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<section class="search-section">
- <div class="filter-options" action="#">
- <p>[Demo] Display content sequences by: </p>
- <div>
- <button class="button" onclick="fetchSEQCountBySpecimen()">Count by Specimen source</button>
- <button class="button" onclick="fetchSEQCountByLocation()">Count by Location</button>
- <button class="button" onclick="fetchSEQCountByTech()">Count by Sequencer</button>
- <button class="button" onclick="fetchAllaccessions()">Show All accessions</button>
- <button class="button" onclick="fetchSEQCountbyContinent()">Count by Continent</button>
- <button class="button" onclick="fetchCountByGPS()">Map</button>
-
- </div>
-
- </div>
-
-</section>
-<div id="loader" class="loader invisible">
-</div>
-
-<section id="results" class="invisible">
- <div id="map"></div>
-</section>
-
- <section>
- <div id="table"></div>
- </section>
diff --git a/bh20simplewebuploader/templates/demo.html b/bh20simplewebuploader/templates/demo.html
index 44aded0..75bc0e2 100644
--- a/bh20simplewebuploader/templates/demo.html
+++ b/bh20simplewebuploader/templates/demo.html
@@ -1,13 +1,51 @@
<!DOCTYPE html>
<html>
{% include 'header.html' %}
+ {% include 'mapheader.html' %}
<body>
{% include 'banner.html' %}
{% include 'menu.html' %}
- {% include 'search.html' %}
- <p>The Virtuoso database contains <span id="CounterDB"></span> public sequences!</p>
- {% include 'demo-run.html' %}
- {% include 'footer.html' %}
+
+ <p>The Virtuoso database contains <span id="CounterDB"></span> public sequences!</p>
+
+ <!--
+ <div class="search">
+ <input id="search-input" type="search" placeholder="FASTA uri" required>
+ <button class="button search-button" type="submit" onclick="search()">
+ <span class="icon ion-search">
+ <span class="sr-only">Search</span>
+ </span>
+ </button>
+ <span class="dropt" title="http://collections.lugli.arvadosapi.com/c=00fede2c6f52b053a14edca01cfa02b7+126/sequence.fasta">(example)<span style="width:500px;"></span></span>
+ </div>
+ -->
+
+ <section class="search-section">
+ <div class="filter-options" action="#">
+        <p>[Demo] Display sequence counts by: </p>
+ <div>
+ <button class="button" onclick="fetchSEQCountBySpecimen()">Count by Specimen source</button>
+ <button class="button" onclick="fetchSEQCountByLocation()">Count by Location</button>
+ <button class="button" onclick="fetchSEQCountByTech()">Count by Sequencer</button>
+ <!-- <button class="button" onclick="fetchAllaccessions()">Show All accessions</button> -->
+ <button class="button" onclick="fetchSEQCountbyContinent()">Count by Continent</button>
+ </div>
+
+ </div>
+
+ </section>
+ <div id="loader" class="loader invisible">
+ </div>
+
+ <section id="map_view" class="map">
+ <div id="mapid"></div>
+ </section>
+
+ <section>
+ <div id="table"></div>
+ </section>
+
+ {% include 'footer.html' %}
<script type="text/javascript">
let scriptRoot = {{ request.script_root|tojson|safe }}; // examples
@@ -24,7 +62,10 @@
});
});
+ drawMap()
+
</script>
+
</body>
</html>
diff --git a/bh20simplewebuploader/templates/error.html b/bh20simplewebuploader/templates/error.html
index b1d9402..fc08aed 100644
--- a/bh20simplewebuploader/templates/error.html
+++ b/bh20simplewebuploader/templates/error.html
@@ -15,7 +15,7 @@
</pre>
</p>
<p>
- <a href="/">Click here to try again.</a>
+ <a href="/upload">Click here to try again.</a>
</p>
<hr>
</body>
diff --git a/bh20simplewebuploader/templates/footer.html b/bh20simplewebuploader/templates/footer.html
index a1dd4fd..f84cef5 100644
--- a/bh20simplewebuploader/templates/footer.html
+++ b/bh20simplewebuploader/templates/footer.html
@@ -21,14 +21,21 @@
<img src="static/image/covid19biohackathon.png"></a>
</div>
<div class="sponsorimg">
- <a href="https://www.commonwl.org/"><img src="static/image/CWL-Logo-Header.png"></a>
+ <a href="https://www.curii.com/"><img src="static/image/curii.logo.ai.png"></a>
</div>
<div class="sponsorimg">
<a href="https://arvados.org/"><img src="static/image/arvados-logo.png"></a>
</div>
<div class="sponsorimg">
+ <a href="https://www.commonwl.org/"><img src="static/image/CWL.png"></a>
+ </div>
+ <div class="sponsorimg">
<a href="https://uthsc.edu/"><img src="static/image/UTHSC-primary-stacked-logo-4c.png"></a>
</div>
+ <div class="sponsorimg">
+ <a href="https://www.esr.cri.nz/"><img src="static/image/ESR.png"></a>
+ </div>
+
</div>
</div>
<div class="footer">
@@ -40,6 +47,9 @@
</center>
</div>
</section>
+{% if load_map %}
+<script type="text/javascript" src="/static/map.js"></script>
+{% endif %}
<script type="text/javascript" src="/static/main.js"></script>
<script type="text/javascript">
diff --git a/bh20simplewebuploader/templates/header.html b/bh20simplewebuploader/templates/header.html
index 0ac5157..1d66590 100644
--- a/bh20simplewebuploader/templates/header.html
+++ b/bh20simplewebuploader/templates/header.html
@@ -6,22 +6,4 @@
{% if blog %}
<link rel="Blog stylesheet" type="text/css" href="/static/blog.css" />
{% endif %}
- <link rel="stylesheet" href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css"
- integrity="sha512-xwE/Az9zrjBIphAcBb3F6JVqxf46+CDLwfLMHloNu6KEQCAWi6HcDUbeOfBIptF7tcCzusKFjFw2yuvEpDL9wQ=="
- crossorigin=""/>
- <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.css"
- integrity="sha512-RLEjtaFGdC4iQMJDbMzim/dOvAu+8Qp9sw7QE4wIMYcg2goVoivzwgSZq9CsIxp4xKAZPKh5J2f2lOko2Ze6FQ=="
- crossorigin=""/>
-
- <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.Default.css"
- integrity="sha512-BBToHPBStgMiw0lD4AtkRIZmdndhB6aQbXpX7omcrXeG2PauGBl2lzq2xUZTxaLxYz5IDHlmneCZ1IJ+P3kYtQ=="
- crossorigin=""/>
-
- <script src="https://unpkg.com/leaflet@1.6.0/dist/leaflet.js"
- integrity="sha512-gZwIG9x3wUXg2hdXF6+rVkLF/0Vi9U8D2Ntg4Ga5I5BZpVkVxlJWbSQtXPSiUTtC0TjtGOmxa1AJPuV0CPthew=="
- crossorigin=""></script>
-
- <script src="https://unpkg.com/leaflet.markercluster@1.4.1/dist/leaflet.markercluster.js"
- integrity="sha512-MQlyPV+ol2lp4KodaU/Xmrn+txc1TP15pOBF/2Sfre7MRsA/pB4Vy58bEqe9u7a7DczMLtU5wT8n7OblJepKbg=="
- crossorigin=""></script>
</head>
diff --git a/bh20simplewebuploader/templates/home.html b/bh20simplewebuploader/templates/home.html
index b90a18d..bede611 100644
--- a/bh20simplewebuploader/templates/home.html
+++ b/bh20simplewebuploader/templates/home.html
@@ -1,6 +1,7 @@
<!DOCTYPE html>
<html>
{% include 'header.html' %}
+ {% include 'mapheader.html' %}
<body>
{% include 'banner.html' %}
{% include 'menu.html' %}
@@ -44,7 +45,19 @@
</div>
</section>
-{% include 'footer.html' %}
+ <section id="map_view" class="map">
+ <div id="mapid"></div>
+ </section>
+
+ {% include 'footer.html' %}
+
+
+ <script type="text/javascript">
+ let scriptRoot = {{ request.script_root|tojson|safe }}; // examples
+
+ drawMap()
+
+ </script>
</body>
</html>
diff --git a/bh20simplewebuploader/templates/map.html b/bh20simplewebuploader/templates/map.html
deleted file mode 100644
index 595af0c..0000000
--- a/bh20simplewebuploader/templates/map.html
+++ /dev/null
@@ -1,33 +0,0 @@
-<!DOCTYPE html>
-<html>
- {% include 'header.html' %}
-<link rel="stylesheet" href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css"
- integrity="sha512-xwE/Az9zrjBIphAcBb3F6JVqxf46+CDLwfLMHloNu6KEQCAWi6HcDUbeOfBIptF7tcCzusKFjFw2yuvEpDL9wQ=="
- crossorigin=""/>
-
- {% include 'banner.html' %}
- {% include 'menu.html' %}
- <div id="mapid" style="height: 500px;"></div>
-
- {% include 'footer.html' %}
-
-
-
-
- <script type="text/javascript">
- let scriptRoot = {{ request.script_root|tojson|safe }}; // examples
- </script>
-
-<!-- Make sure you put this AFTER Leaflet's CSS -->
- <script src="https://unpkg.com/leaflet@1.6.0/dist/leaflet.js"
- integrity="sha512-gZwIG9x3wUXg2hdXF6+rVkLF/0Vi9U8D2Ntg4Ga5I5BZpVkVxlJWbSQtXPSiUTtC0TjtGOmxa1AJPuV0CPthew=="
- crossorigin=""></script>
-
- <script>
- //drawMap
- drawMap()
- </script>
-
- </body>
-
-</html>
diff --git a/bh20simplewebuploader/templates/mapheader.html b/bh20simplewebuploader/templates/mapheader.html
new file mode 100644
index 0000000..ca62051
--- /dev/null
+++ b/bh20simplewebuploader/templates/mapheader.html
@@ -0,0 +1,16 @@
+ <link rel="stylesheet" href="https://unpkg.com/leaflet@1.6.0/dist/leaflet.css"
+ integrity="sha512-xwE/Az9zrjBIphAcBb3F6JVqxf46+CDLwfLMHloNu6KEQCAWi6HcDUbeOfBIptF7tcCzusKFjFw2yuvEpDL9wQ=="
+ crossorigin=""/>
+ <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.css"
+ integrity="sha512-RLEjtaFGdC4iQMJDbMzim/dOvAu+8Qp9sw7QE4wIMYcg2goVoivzwgSZq9CsIxp4xKAZPKh5J2f2lOko2Ze6FQ=="
+ crossorigin=""/>
+ <link rel="stylesheet" href="https://unpkg.com/leaflet.markercluster@1.4.1/dist/MarkerCluster.Default.css"
+ integrity="sha512-BBToHPBStgMiw0lD4AtkRIZmdndhB6aQbXpX7omcrXeG2PauGBl2lzq2xUZTxaLxYz5IDHlmneCZ1IJ+P3kYtQ=="
+ crossorigin=""/>
+
+ <script src="https://unpkg.com/leaflet@1.6.0/dist/leaflet.js"
+ integrity="sha512-gZwIG9x3wUXg2hdXF6+rVkLF/0Vi9U8D2Ntg4Ga5I5BZpVkVxlJWbSQtXPSiUTtC0TjtGOmxa1AJPuV0CPthew=="
+ crossorigin=""></script>
+ <script src="https://unpkg.com/leaflet.markercluster@1.4.1/dist/leaflet.markercluster.js"
+ integrity="sha512-MQlyPV+ol2lp4KodaU/Xmrn+txc1TP15pOBF/2Sfre7MRsA/pB4Vy58bEqe9u7a7DczMLtU5wT8n7OblJepKbg=="
+ crossorigin=""></script>
diff --git a/bh20simplewebuploader/templates/search.html b/bh20simplewebuploader/templates/search.html
index dbdca90..e69de29 100644
--- a/bh20simplewebuploader/templates/search.html
+++ b/bh20simplewebuploader/templates/search.html
@@ -1,10 +0,0 @@
-<div class="search">
- <input id="search-input" type="search" placeholder="FASTA uri" required>
- <button class="button search-button" type="submit" onclick="search()">
- <span class="icon ion-search">
- <span class="sr-only">Search</span>
- </span>
- </button>
- <span class="dropt" title="http://collections.lugli.arvadosapi.com/c=00fede2c6f52b053a14edca01cfa02b7+126/sequence.fasta">(example)<span style="width:500px;"></span></span>
-</div>
-
diff --git a/bh20simplewebuploader/templates/status.html b/bh20simplewebuploader/templates/status.html
index a1cf28f..e89437e 100644
--- a/bh20simplewebuploader/templates/status.html
+++ b/bh20simplewebuploader/templates/status.html
@@ -7,7 +7,8 @@
<h1>Sequence upload processing status</h1>
- <div class="status">
+ <div class="status">
+
{{ table }}
</div>
diff --git a/bh20simplewebuploader/templates/success.html b/bh20simplewebuploader/templates/success.html
index 9f0987c..c2302fa 100644
--- a/bh20simplewebuploader/templates/success.html
+++ b/bh20simplewebuploader/templates/success.html
@@ -9,7 +9,7 @@
<h1>Upload Successful</h1>
<hr>
<p>
- Your files have been uploaded. They should soon appear as output of the <a href="/download">Public SARS-CoV-2 Sequence Resource</a>.
+          Your files have been uploaded. You can track their <a href="/status">QC status</a>; once validated, they will become part of the <a href="/download">Public SARS-CoV-2 Sequence Resource</a>.
</p>
<p>
The upload log was:
diff --git a/bh20simplewebuploader/templates/validated.html b/bh20simplewebuploader/templates/validated.html
new file mode 100644
index 0000000..cee94bd
--- /dev/null
+++ b/bh20simplewebuploader/templates/validated.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+ {% include 'header.html' %}
+ <body>
+ {% include 'banner.html' %}
+ {% include 'menu.html' %}
+
+ <h1>Validated sequences</h1>
+
+ <div class="status">
+ {{ table }}
+ </div>
+
+{% include 'footer.html' %}
+
+ </body>
+</html>
diff --git a/doc/INSTALL.md b/doc/INSTALL.md
index 6dcd72b..3b270dd 100644
--- a/doc/INSTALL.md
+++ b/doc/INSTALL.md
@@ -42,7 +42,7 @@ repository.
### Using the Web Uploader
-To run the web uploader in a GNU Guix environment/container
+To run the web uploader in a GNU Guix environment/container, run something like
```
guix environment guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs --network openssl -- env FLASK_ENV=development PYTHONPATH=$PYTHONPATH:./bh20sequploader FLASK_APP=bh20simplewebuploader/main.py flask run
@@ -59,7 +59,7 @@ WIP: add gunicorn container
Currently the full webserver container deploy command looks like
```
-penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
-``
+penguin2:~/iwrk/opensource/code/vg/bh20-seq-resource$ env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ ~/iwrk/opensource/guix/guix/pre-inst-env guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc clustalw python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_APP=bh20simplewebuploader/main.py flask run
+```
Note: see above on GUIX_PACKAGE_PATH.
diff --git a/doc/blog/using-covid-19-pubseq-part1.html b/doc/blog/using-covid-19-pubseq-part1.html
index 1959fac..0e6136c 100644
--- a/doc/blog/using-covid-19-pubseq-part1.html
+++ b/doc/blog/using-covid-19-pubseq-part1.html
@@ -3,7 +3,7 @@
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
-<!-- 2020-05-29 Fri 12:06 -->
+<!-- 2020-07-17 Fri 05:05 -->
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>COVID-19 PubSeq (part 1)</title>
@@ -248,20 +248,20 @@ for the JavaScript code in this tag.
<h2>Table of Contents</h2>
<div id="text-table-of-contents">
<ul>
-<li><a href="#org9afe6ab">1. What does this mean?</a></li>
-<li><a href="#orgf4bc3d4">2. Fetch sequence data</a></li>
-<li><a href="#org9d7d482">3. Predicates</a></li>
-<li><a href="#orgc6046bb">4. Fetch submitter info and other metadata</a></li>
-<li><a href="#orgdcb216b">5. Fetch all sequences from Washington state</a></li>
-<li><a href="#org7060f51">6. Discussion</a></li>
-<li><a href="#orgdc51ccc">7. Acknowledgements</a></li>
+<li><a href="#org0db5db0">1. What does this mean?</a></li>
+<li><a href="#orge5267fd">2. Fetch sequence data</a></li>
+<li><a href="#orgfbd3adc">3. Predicates</a></li>
+<li><a href="#org08e70e1">4. Fetch submitter info and other metadata</a></li>
+<li><a href="#org9194557">5. Fetch all sequences from Washington state</a></li>
+<li><a href="#org76317ad">6. Discussion</a></li>
+<li><a href="#orgeb871a1">7. Acknowledgements</a></li>
</ul>
</div>
</div>
-<div id="outline-container-org9afe6ab" class="outline-2">
-<h2 id="org9afe6ab"><span class="section-number-2">1</span> What does this mean?</h2>
+<div id="outline-container-org0db5db0" class="outline-2">
+<h2 id="org0db5db0"><span class="section-number-2">1</span> What does this mean?</h2>
<div class="outline-text-2" id="text-1">
<p>
This means that when someone uploads a SARS-CoV-2 sequence using one
@@ -274,24 +274,24 @@ expressed in a <a href="https://github.com/arvados/bh20-seq-resource/blob/master
type: record
fields:
host_species:
- doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_<span style="color: #8bc34a;">9606</span> for Homo sapiens
+ doc: Host species as defined in NCBITaxon, e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens
type: string
jsonldPredicate:
- _id: http://www.ebi.ac.uk/efo/EFO_<span style="color: #8bc34a;">0000532</span>
- _type: <span style="color: #9ccc65;">"@id"</span>
- noLinkCheck: <span style="color: #8bc34a;">true</span>
+ _id: http://www.ebi.ac.uk/efo/EFO_0000532
+ _type: "@id"
+ noLinkCheck: true
host_sex:
- doc: Sex of the host as defined in PATO, expect male <span style="color: #e91e63;">()</span> or female <span style="color: #e91e63;">()</span>
+ doc: Sex of the host as defined in PATO, expect male () or female ()
type: string?
jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/PATO_<span style="color: #8bc34a;">0000047</span>
- _type: <span style="color: #9ccc65;">"@id"</span>
- noLinkCheck: <span style="color: #8bc34a;">true</span>
+ _id: http://purl.obolibrary.org/obo/PATO_0000047
+ _type: "@id"
+ noLinkCheck: true
host_age:
- doc: Age of the host as number <span style="color: #e91e63;">(</span>e.g. <span style="color: #8bc34a;">50</span><span style="color: #e91e63;">)</span>
+ doc: Age of the host as number (e.g. 50)
type: int?
jsonldPredicate:
- _id: http://purl.obolibrary.org/obo/PATO_<span style="color: #8bc34a;">0000011</span>
+ _id: http://purl.obolibrary.org/obo/PATO_0000011
</pre>
</div>
@@ -314,8 +314,8 @@ initiative!
</div>
-<div id="outline-container-orgf4bc3d4" class="outline-2">
-<h2 id="orgf4bc3d4"><span class="section-number-2">2</span> Fetch sequence data</h2>
+<div id="outline-container-orge5267fd" class="outline-2">
+<h2 id="orge5267fd"><span class="section-number-2">2</span> Fetch sequence data</h2>
<div class="outline-text-2" id="text-2">
<p>
The latest run of the pipeline can be viewed <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">here</a>. Each of these
@@ -339,8 +339,8 @@ these identifiers throughout.
</div>
</div>
-<div id="outline-container-org9d7d482" class="outline-2">
-<h2 id="org9d7d482"><span class="section-number-2">3</span> Predicates</h2>
+<div id="outline-container-orgfbd3adc" class="outline-2">
+<h2 id="orgfbd3adc"><span class="section-number-2">3</span> Predicates</h2>
<div class="outline-text-2" id="text-3">
<p>
To explore an RDF dataset, the first query we can do is open and gets
@@ -350,10 +350,10 @@ the following in a SPARQL end point
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?p
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">select distinct ?p
+{
?o ?p ?s
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -364,10 +364,10 @@ To get a <a href="http://sparql.genenetwork.org/sparql/?default-graph-uri=&amp;q
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?g
-<span style="color: #e91e63;">{</span>
- GRAPH ?g <span style="color: #2196F3;">{</span>?s ?p ?o<span style="color: #2196F3;">}</span>
-<span style="color: #e91e63;">}</span>
+<pre class="src src-sql">select distinct ?g
+{
+ GRAPH ?g {?s ?p ?o}
+}
</pre>
</div>
@@ -383,10 +383,10 @@ To list all submitters, try
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?s
-<span style="color: #e91e63;">{</span>
- ?o <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">#MainSchema/submitter&gt;</a> ?s
-<span style="color: #e91e63;">}</span>
+<pre class="src src-sql">select distinct ?s
+{
+ ?o &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/submitter&gt; ?s
+}
</pre>
</div>
@@ -397,11 +397,11 @@ and by
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?s
-<span style="color: #e91e63;">{</span>
- ?o <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/submitter">#MainSchema/submitter&gt;</a> ?id .
+<pre class="src src-sql">select distinct ?s
+{
+ ?o &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/submitter&gt; ?id .
?id ?p ?s
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -415,12 +415,12 @@ To lift the full URL out of the query you can use a header like
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?dataset ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?dataset ?submitter
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -438,32 +438,32 @@ Now we got this far, lets <a href="http://sparql.genenetwork.org/sparql/?default
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #e91e63;">(</span><span style="color: #ff8A65;">COUNT</span><span style="color: #2196F3;">(</span><span style="color: #fff59d;">distinct</span> ?dataset<span style="color: #2196F3;">)</span> <span style="color: #fff59d;">as</span> ?num<span style="color: #e91e63;">)</span>
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select (COUNT(distinct ?dataset) as ?num)
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
</div>
</div>
-<div id="outline-container-orgc6046bb" class="outline-2">
-<h2 id="orgc6046bb"><span class="section-number-2">4</span> Fetch submitter info and other metadata</h2>
+<div id="outline-container-org08e70e1" class="outline-2">
+<h2 id="org08e70e1"><span class="section-number-2">4</span> Fetch submitter info and other metadata</h2>
<div class="outline-text-2" id="text-4">
<p>
To get datasets with submitters we can do the above
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?dataset ?p ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?dataset ?p ?submitter
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -480,13 +480,13 @@ Let's focus on one sample with
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?dataset ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?dataset ?submitter
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter .
- FILTER<span style="color: #2196F3;">(</span><span style="color: #fff59d;">CONTAINS</span><span style="color: #EF6C00;">(</span>?submitter,"Roychoudhury"<span style="color: #EF6C00;">)</span><span style="color: #2196F3;">)</span> .
-<span style="color: #e91e63;">}</span>
+ FILTER(CONTAINS(?submitter,"Roychoudhury")) .
+}
</pre>
</div>
@@ -496,12 +496,12 @@ see if we can get a sample ID by listing sample predicates
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?p
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?p
+{
?dataset ?p ?o .
?dataset pubseq:submitter ?id .
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -513,15 +513,15 @@ Let's zoom in on those of Roychoudhury with
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/">#MainSchema/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?sid ?sample ?p1 ?dataset ?submitter
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+select distinct ?sid ?sample ?p1 ?dataset ?submitter
+{
?dataset pubseq:submitter ?id .
?id ?p ?submitter .
- FILTER<span style="color: #2196F3;">(</span><span style="color: #fff59d;">CONTAINS</span><span style="color: #EF6C00;">(</span>?submitter,"Roychoudhury"<span style="color: #EF6C00;">)</span><span style="color: #2196F3;">)</span> .
+ FILTER(CONTAINS(?submitter,"Roychoudhury")) .
?dataset pubseq:sample ?sid .
?sid ?p1 ?sample
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -532,18 +532,13 @@ this database. Let's focus on one sample "MT326090.1" with predicate
</p>
<div class="org-src-container">
-<pre class="src src-sql"><span style="color: #fff59d;">PREFIX</span> pubseq: <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">#MainSchema/&gt;
-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/">PREFIX</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
-PREFIX sio: &lt;http://semanticscience.org/resource/"> sio: &lt;http://semanticscience.org/resource/&gt;</a>
-<span style="color: #fff59d;">select</span> <span style="color: #fff59d;">distinct</span> ?sample ?p ?o
-<span style="color: #e91e63;">{</span>
+<pre class="src src-sql">PREFIX pubseq: &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/&gt;
+PREFIX sio: &lt;http://semanticscience.org/resource/&gt;
+select distinct ?sample ?p ?o
+{
?sample sio:SIO_000115 "MT326090.1" .
?sample ?p ?o .
-<span style="color: #e91e63;">}</span>
+}
</pre>
</div>
@@ -561,8 +556,8 @@ to view/query the database.
</div>
</div>
-<div id="outline-container-orgdcb216b" class="outline-2">
-<h2 id="orgdcb216b"><span class="section-number-2">5</span> Fetch all sequences from Washington state</h2>
+<div id="outline-container-org9194557" class="outline-2">
+<h2 id="org9194557"><span class="section-number-2">5</span> Fetch all sequences from Washington state</h2>
<div class="outline-text-2" id="text-5">
<p>
Now we know how to get at the origin we can do it the other way round
@@ -570,15 +565,11 @@ and fetch all sequences referring to Washington state
</p>
<div class="org-src-container">
-<pre class="src src-sql">
-<span style="color: #fff59d;">select</span> ?seq ?sample
-<span style="color: #e91e63;">{</span>
- ?seq <a href="http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
- ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223">&lt;http://biohackathon.org/bh20-seq-</a><span style="color: #fff59d;"><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
- ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223">schema</a></span><a href="http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
- ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223">#MainSchema/sample&gt; ?sample .
- ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223&gt;</a>
-<span style="color: #e91e63;">}</span>
+<pre class="src src-sql">select ?seq ?sample
+{
+ ?seq &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
+ ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q1223&gt;
+}
</pre>
</div>
@@ -586,11 +577,26 @@ and fetch all sequences referring to Washington state
which lists 300 sequences originating from Washington state! Which is almost
half of the set coming out of GenBank.
</p>
+
+<p>
+Likewise, to list all sequences from Turkey we look up the wikidata
+entity, which is <a href="https://www.wikidata.org/wiki/Q43">Q43</a>:
+</p>
+
+<div class="org-src-container">
+<pre class="src src-sql">select ?seq ?sample
+{
+ ?seq &lt;http://biohackathon.org/bh20-seq-schema#MainSchema/sample&gt; ?sample .
+ ?sample &lt;http://purl.obolibrary.org/obo/GAZ_00000448&gt; &lt;http://www.wikidata.org/entity/Q43&gt;
+}
+</pre>
</div>
</div>
+</div>
+
-<div id="outline-container-org7060f51" class="outline-2">
-<h2 id="org7060f51"><span class="section-number-2">6</span> Discussion</h2>
+<div id="outline-container-org76317ad" class="outline-2">
+<h2 id="org76317ad"><span class="section-number-2">6</span> Discussion</h2>
<div class="outline-text-2" id="text-6">
<p>
The public sequence uploader collects sequences, raw data and
@@ -601,8 +607,8 @@ referenced in publications and origins are citeable.
</div>
</div>
-<div id="outline-container-orgdc51ccc" class="outline-2">
-<h2 id="orgdc51ccc"><span class="section-number-2">7</span> Acknowledgements</h2>
+<div id="outline-container-orgeb871a1" class="outline-2">
+<h2 id="orgeb871a1"><span class="section-number-2">7</span> Acknowledgements</h2>
<div class="outline-text-2" id="text-7">
<p>
The overall effort was due to magnificent freely donated input by a
@@ -617,7 +623,7 @@ Garrison this initiative would not have existed!
</div>
</div>
<div id="postamble" class="status">
-<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-05-29 Fri 12:06</small>.
+<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-17 Fri 05:02</small>.
</div>
</body>
</html>
diff --git a/doc/blog/using-covid-19-pubseq-part4.html b/doc/blog/using-covid-19-pubseq-part4.html
index b5a05ca..c975c21 100644
--- a/doc/blog/using-covid-19-pubseq-part4.html
+++ b/doc/blog/using-covid-19-pubseq-part4.html
@@ -3,7 +3,7 @@
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
-<!-- 2020-07-12 Sun 06:24 -->
+<!-- 2020-07-17 Fri 05:04 -->
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>COVID-19 PubSeq (part 4)</title>
@@ -161,6 +161,19 @@
.footdef { margin-bottom: 1em; }
.figure { padding: 1em; }
.figure p { text-align: center; }
+ .equation-container {
+ display: table;
+ text-align: center;
+ width: 100%;
+ }
+ .equation {
+ vertical-align: middle;
+ }
+ .equation-label {
+ display: table-cell;
+ text-align: right;
+ vertical-align: middle;
+ }
.inlinetask {
padding: 10px;
border: 2px solid gray;
@@ -186,7 +199,7 @@
@licstart The following is the entire license notice for the
JavaScript code in this tag.
-Copyright (C) 2012-2018 Free Software Foundation, Inc.
+Copyright (C) 2012-2020 Free Software Foundation, Inc.
The JavaScript code in this tag is free software: you can
redistribute it and/or modify it under the terms of the GNU
@@ -235,15 +248,16 @@ for the JavaScript code in this tag.
<h2>Table of Contents</h2>
<div id="text-table-of-contents">
<ul>
-<li><a href="#org8f8b64a">1. What does this mean?</a></li>
-<li><a href="#orgcc7a403">2. Modify Workflow</a></li>
+<li><a href="#orgc2ee09f">1. What does this mean?</a></li>
+<li><a href="#org0d37881">2. Where can I find the workflows?</a></li>
+<li><a href="#orgddb0531">3. Modify Workflow</a></li>
</ul>
</div>
</div>
-<div id="outline-container-org8f8b64a" class="outline-2">
-<h2 id="org8f8b64a"><span class="section-number-2">1</span> What does this mean?</h2>
+<div id="outline-container-orgc2ee09f" class="outline-2">
+<h2 id="orgc2ee09f"><span class="section-number-2">1</span> What does this mean?</h2>
<div class="outline-text-2" id="text-1">
<p>
This means that when someone uploads a SARS-CoV-2 sequence using one
@@ -253,18 +267,28 @@ which triggers a rerun of our workflows.
</div>
</div>
-
-<div id="outline-container-orgcc7a403" class="outline-2">
-<h2 id="orgcc7a403"><span class="section-number-2">2</span> Modify Workflow</h2>
+<div id="outline-container-org0d37881" class="outline-2">
+<h2 id="org0d37881"><span class="section-number-2">2</span> Where can I find the workflows?</h2>
<div class="outline-text-2" id="text-2">
<p>
+Workflows are written in the Common Workflow Language (CWL) and listed
+on <a href="https://github.com/arvados/bh20-seq-resource/tree/master/workflows">github</a>. Because PubSeq is an open project, these workflows can be studied
+and modified!
+</p>
+</div>
+</div>
+
+<div id="outline-container-orgddb0531" class="outline-2">
+<h2 id="orgddb0531"><span class="section-number-2">3</span> Modify Workflow</h2>
+<div class="outline-text-2" id="text-3">
+<p>
<i>Work in progress!</i>
</p>
</div>
</div>
</div>
<div id="postamble" class="status">
-<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-12 Sun 06:24</small>.
+<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-17 Fri 01:47</small>.
</div>
</body>
</html>
diff --git a/doc/blog/using-covid-19-pubseq-part4.org b/doc/blog/using-covid-19-pubseq-part4.org
index 5fe71d1..8ad5e2d 100644
--- a/doc/blog/using-covid-19-pubseq-part4.org
+++ b/doc/blog/using-covid-19-pubseq-part4.org
@@ -10,6 +10,7 @@
* Table of Contents :TOC:noexport:
- [[#what-does-this-mean][What does this mean?]]
+ - [[#where-can-i-find-the-workflows][Where can I find the workflows?]]
- [[#modify-workflow][Modify Workflow]]
* What does this mean?
@@ -18,6 +19,11 @@ This means that when someone uploads a SARS-CoV-2 sequence using one
of our tools (CLI or web-based) they add a sequence and some metadata
which triggers a rerun of our workflows.
+* Where can I find the workflows?
+
+Workflows are written in the Common Workflow Language (CWL) and listed
+on [[https://github.com/arvados/bh20-seq-resource/tree/master/workflows][github]]. Because PubSeq is an open project, these workflows can be studied
+and modified!
* Modify Workflow
diff --git a/doc/blog/using-covid-19-pubseq-part5.html b/doc/blog/using-covid-19-pubseq-part5.html
index 80bf559..4caa5ac 100644
--- a/doc/blog/using-covid-19-pubseq-part5.html
+++ b/doc/blog/using-covid-19-pubseq-part5.html
@@ -3,7 +3,7 @@
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
-<!-- 2020-07-12 Sun 06:24 -->
+<!-- 2020-07-17 Fri 05:03 -->
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>COVID-19 PubSeq (part 4)</title>
@@ -161,6 +161,19 @@
.footdef { margin-bottom: 1em; }
.figure { padding: 1em; }
.figure p { text-align: center; }
+ .equation-container {
+ display: table;
+ text-align: center;
+ width: 100%;
+ }
+ .equation {
+ vertical-align: middle;
+ }
+ .equation-label {
+ display: table-cell;
+ text-align: right;
+ vertical-align: middle;
+ }
.inlinetask {
padding: 10px;
border: 2px solid gray;
@@ -186,7 +199,7 @@
@licstart The following is the entire license notice for the
JavaScript code in this tag.
-Copyright (C) 2012-2018 Free Software Foundation, Inc.
+Copyright (C) 2012-2020 Free Software Foundation, Inc.
The JavaScript code in this tag is free software: you can
redistribute it and/or modify it under the terms of the GNU
@@ -235,38 +248,40 @@ for the JavaScript code in this tag.
<h2>Table of Contents</h2>
<div id="text-table-of-contents">
<ul>
-<li><a href="#org871ad58">1. Modify Metadata</a></li>
-<li><a href="#org07e8755">2. What is the schema?</a></li>
-<li><a href="#org4857280">3. How is the website generated?</a></li>
-<li><a href="#orge709ae2">4. Modifying the schema</a></li>
+<li><a href="#org758b923">1. Modify Metadata</a></li>
+<li><a href="#orgec32c13">2. What is the schema?</a></li>
+<li><a href="#org2e487b2">3. How is the website generated?</a></li>
+<li><a href="#orge4dfe84">4. Modifying the schema</a></li>
+<li><a href="#org564a7a8">5. Adding fields to the form</a></li>
+<li><a href="#org633781a">6. <span class="todo TODO">TODO</span> Testing the license fields</a></li>
</ul>
</div>
</div>
-<div id="outline-container-org871ad58" class="outline-2">
-<h2 id="org871ad58"><span class="section-number-2">1</span> Modify Metadata</h2>
+<div id="outline-container-org758b923" class="outline-2">
+<h2 id="org758b923"><span class="section-number-2">1</span> Modify Metadata</h2>
<div class="outline-text-2" id="text-1">
<p>
The public sequence resource uses multiple data formats listed on the
-<a href="./download">DOWNLOAD</a> page. One of the most exciting features is the full support
+<a href="http://covid19.genenetwork.org/download">download</a> page. One of the most exciting features is the full support
for RDF and semantic web/linked data ontologies. This technology
allows for querying data in unprescribed ways - that is, you can
formulate your own queries without dealing with a preset model of that
data (so typical of CSV files and SQL tables). Examples of exploring
-data are listed <a href="./blog?id=using-covid-19-pubseq-part1">here</a>.
+data are listed <a href="http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1">here</a>.
</p>
<p>
In this BLOG we are going to look at the metadata entered on the
-<a href="./">COVID-19 PubSeq</a> website (or command line client). It is important to
+COVID-19 PubSeq website (or command line client). It is important to
understand that anyone, including you, can change that information!
</p>
</div>
</div>
-<div id="outline-container-org07e8755" class="outline-2">
-<h2 id="org07e8755"><span class="section-number-2">2</span> What is the schema?</h2>
+<div id="outline-container-orgec32c13" class="outline-2">
+<h2 id="orgec32c13"><span class="section-number-2">2</span> What is the schema?</h2>
<div class="outline-text-2" id="text-2">
<p>
The default metadata schema is listed <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml">here</a>.
@@ -274,8 +289,8 @@ The default metadata schema is listed <a href="https://github.com/arvados/bh20-s
</div>
</div>
-<div id="outline-container-org4857280" class="outline-2">
-<h2 id="org4857280"><span class="section-number-2">3</span> How is the website generated?</h2>
+<div id="outline-container-org2e487b2" class="outline-2">
+<h2 id="org2e487b2"><span class="section-number-2">3</span> How is the website generated?</h2>
<div class="outline-text-2" id="text-3">
<p>
Using the schema we use <a href="https://pypi.org/project/PyShEx/">pyshex</a> shex expressions and <a href="https://github.com/common-workflow-language/schema_salad">schema salad</a> to
@@ -285,13 +300,13 @@ All from that one metadata schema.
</div>
</div>
-<div id="outline-container-orge709ae2" class="outline-2">
-<h2 id="orge709ae2"><span class="section-number-2">4</span> Modifying the schema</h2>
+<div id="outline-container-orge4dfe84" class="outline-2">
+<h2 id="orge4dfe84"><span class="section-number-2">4</span> Modifying the schema</h2>
<div class="outline-text-2" id="text-4">
<p>
-One of the first things we wanted to do is to add a field for the data
-license. Initially we only support CC-4.0 as a license by default, but
-now we want to give uploaders the option to make it an even more
+One of the first things we want to do is to add a field for the data
+license. Initially we only supported CC-4.0 as a license, but
+we wanted to give uploaders the option to use an even more
liberal CC0 license. The first step is to find a good ontology term
for the field. Searching for `creative commons cc0 rdf' rendered this
useful <a href="https://creativecommons.org/ns">page</a>. We also find an <a href="https://wiki.creativecommons.org/wiki/CC_License_Rdf_Overview">overview</a> where CC0 is represented as URI
@@ -302,13 +317,148 @@ attributionName and attributionURL.
</p>
<p>
-<i>Note: work in progress</i>
+A minimal triple should be
+</p>
+
+<pre class="example">
+id xhtml:license &lt;http://creativecommons.org/licenses/by/4.0/&gt; .
+</pre>
+
+
+<p>
+Other suggestions are
+</p>
+
+<pre class="example">
+id dc:title "Description" .
+id cc:attributionName "Your Name" .
+id cc:attributionURL &lt;http://resource.org/id&gt;
+</pre>
+
+
+<p>
+and 'dc:source' which indicates the original source of any modified
+work, specified as a URI.
+The prefix 'cc:' is an abbreviation for <a href="http://creativecommons.org/ns">http://creativecommons.org/ns</a>#.
+</p>
+
+<p>
+Going back to the schema, where does it fit? Under host, sample,
+virus, technology or submitter block? It could fit under sample, but
+actually the license concerns the whole metadata block and sequence,
+so I think it can go under its own license tag. For example
+</p>
+
+
+<p>
+id: placeholder
+</p>
+
+<pre class="example">
+license:
+ license_type: http://creativecommons.org/licenses/by/4.0/
+ attribution_title: "Sample ID"
+ attribution_name: "John doe, Joe Boe, Jonny Oe"
+ attribution_url: http://covid19.genenetwork.org/id
+ attribution_source: https://www.ncbi.nlm.nih.gov/pubmed/323088888
+</pre>
+
+
+<p>
+So, let's update the example. Notice the license info is optional - if it is missing
+we just assume the default CC-4.0.
+</p>
+
+<p>
+One thing that is interesting is that in the name space <a href="https://creativecommons.org/ns">https://creativecommons.org/ns</a> there
+is no mention of a title. I think it is useful, however, because we have no such field.
+So, we'll add it simply as a title field. Now the draft schema is
</p>
+
+<div class="org-src-container">
+<pre class="src src-js">- name: licenseSchema
+ type: record
+ fields:
+ license_type:
+ doc: License types as refined in https://wiki.creativecommons.org/images/d/d6/Ccrel-1.0.pdf
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#License
+ title:
+ doc: Attribution title related to license
+ type: string?
+ jsonldPredicate:
+ _id: http://semanticscience.org/resource/SIO_001167
+ attribution_url:
+ doc: Attribution URL related to license
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#Work
+ attribution_source:
+ doc: Attribution source URL
+ type: string?
+ jsonldPredicate:
+ _id: https://creativecommons.org/ns#Work
+</pre>
+</div>
+
+<p>
+Now, we are no ontology experts, right? So, next we submit a patch to
+our source tree and ask for feedback before wiring it up in the data
+entry form. The pull request was submitted <a href="https://github.com/arvados/bh20-seq-resource/pull/97">here</a> and reviewed on the
+gitter channel, after which I merged it.
+</p>
+</div>
</div>
+
+<div id="outline-container-org564a7a8" class="outline-2">
+<h2 id="org564a7a8"><span class="section-number-2">5</span> Adding fields to the form</h2>
+<div class="outline-text-2" id="text-5">
+<p>
+To add the new fields to the form we have to modify it a little. If we
+go to the upload form we need to add the license box. The schema is
+loaded in <a href="https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229">main.py</a> in the 'generate<sub>form</sub>' function.
+</p>
+
+<p>
+With this <a href="https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3">patch</a> the website adds the license input fields on the form.
+</p>
+
+<p>
+Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This
+was done with this <a href="https://github.com/arvados/bh20-seq-resource/commit/f4ed46dae20abe5147871495ede2d6ac2b0854bc">patch</a>. In the end we decided to use the Dublin core title,
+<a href="http://purl.org/metadata/dublin_core_elements#Title">http://purl.org/metadata/dublin_core_elements#Title</a>:
+</p>
+
+<div class="org-src-container">
+<pre class="src src-js">:licenseShape{
+ cc:License xsd:string;
+ dc:Title xsd:string ?;
+ cc:attributionName xsd:string ?;
+ cc:attributionURL xsd:string ?;
+ cc:attributionSource xsd:string ?;
+}
+</pre>
+</div>
+
+<p>
+Note that cc:attributionSource is not really defined in the cc standard.
+</p>
+
+<p>
+When pushing the license info we discovered the workflow broke because
+the existing data had no licensing info. So we changed the license
+field to be optional - a missing license is assumed to be CC-BY-4.0.
+</p>
+</div>
+</div>
+
+<div id="outline-container-org633781a" class="outline-2">
+<h2 id="org633781a"><span class="section-number-2">6</span> <span class="todo TODO">TODO</span> Testing the license fields</h2>
</div>
</div>
<div id="postamble" class="status">
-<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-12 Sun 06:24</small>.
+<hr><small>Created by <a href="http://thebird.nl/">Pjotr Prins</a> (pjotr.public768 at thebird 'dot' nl) using Emacs org-mode and a healthy dose of Lisp!<br />Modified 2020-07-16 Thu 03:27</small>.
</div>
</body>
</html>
diff --git a/doc/blog/using-covid-19-pubseq-part5.org b/doc/blog/using-covid-19-pubseq-part5.org
index 4b0ea64..78eea66 100644
--- a/doc/blog/using-covid-19-pubseq-part5.org
+++ b/doc/blog/using-covid-19-pubseq-part5.org
@@ -13,19 +13,21 @@
- [[#what-is-the-schema][What is the schema?]]
- [[#how-is-the-website-generated][How is the website generated?]]
- [[#modifying-the-schema][Modifying the schema]]
+ - [[#adding-fields-to-the-form][Adding fields to the form]]
+ - [[#testing-the-license-fields][Testing the license fields]]
* Modify Metadata
The public sequence resource uses multiple data formats listed on the
-[[./download][DOWNLOAD]] page. One of the most exciting features is the full support
+[[http://covid19.genenetwork.org/download][download]] page. One of the most exciting features is the full support
for RDF and semantic web/linked data ontologies. This technology
allows for querying data in unprescribed ways - that is, you can
formulate your own queries without dealing with a preset model of that
data (so typical of CSV files and SQL tables). Examples of exploring
-data are listed [[./blog?id=using-covid-19-pubseq-part1][here]].
+data are listed [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1][here]].
In this BLOG we are going to look at the metadata entered on the
-[[./][COVID-19 PubSeq]] website (or command line client). It is important to
+COVID-19 PubSeq website (or command line client). It is important to
understand that anyone, including you, can change that information!
* What is the schema?
@@ -41,8 +43,8 @@ All from that one metadata schema.
* Modifying the schema
One of the first things we want to do is to add a field for the data
-license. Initially we only support CC-4.0 as a license by default, but
-now we want to give uploaders the option to make it an even more
+license. Initially we only supported CC-BY-4.0 as a license, but
+we wanted to give uploaders the option to use an even more
liberal CC0 license. The first step is to find a good ontology term
for the field. Searching for `creative commons cc0 rdf' rendered this
useful [[https://creativecommons.org/ns][page]]. We also find an [[https://wiki.creativecommons.org/wiki/CC_License_Rdf_Overview][overview]] where CC0 is represented as URI
@@ -113,8 +115,37 @@ So, we'll add it simply as a title field. Now the draft schema is
_id: https://creativecommons.org/ns#Work
#+END_SRC
-Now, we are no ontology experts, right? So, next we submit a patch to our source tree and
-ask for feedback before wiring it up in the data entry form. The pull request was
-submitted here FIXME.
+Now, we are no ontology experts, right? So, next we submit a patch to
+our source tree and ask for feedback before wiring it up in the data
+entry form. The pull request was submitted [[https://github.com/arvados/bh20-seq-resource/pull/97][here]], reviewed on the
+Gitter channel, and then I merged it.
-/Note: work in progress/
+* Adding fields to the form
+
+To add the new fields to the form we have to modify the uploader a
+little: the upload form needs a license box. The schema is loaded in
+[[https://github.com/arvados/bh20-seq-resource/blob/a0c8ebd57b875f265e8b0efec4abfaf892eb6c45/bh20simplewebuploader/main.py#L229][main.py]] in the 'generate_form' function.
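+
+As a minimal sketch (not the actual PubSeq code), schema-driven form
+generation boils down to walking the schema properties and emitting
+one input element per field; the field names below are made up:
+
+#+BEGIN_SRC python
+# Toy schema fragment: in PubSeq this information comes from the
+# metadata schema loaded by generate_form(); names here are invented.
+license_fields = {
+    "license_type": {"type": "string", "required": True},
+    "title": {"type": "string", "required": False},
+    "attribution_name": {"type": "string", "required": False},
+    "attribution_url": {"type": "string", "required": False},
+}
+
+def fields_to_inputs(prefix, fields):
+    """Turn schema fields into simple HTML input elements."""
+    out = []
+    for name, spec in fields.items():
+        required = " required" if spec.get("required") else ""
+        out.append('<label>{}</label>'.format(name))
+        out.append('<input type="text" name="{}.{}"{}>'.format(prefix, name, required))
+    return "\n".join(out)
+
+print(fields_to_inputs("license", license_fields))
+#+END_SRC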
+
+With this [[https://github.com/arvados/bh20-seq-resource/commit/b9691c7deae30bd6422fb7b0681572b7b6f78ae3][patch]] the website adds the license input fields to the form.
+
+Finally, to make RDF output work we need to add expressions to bh20seq-shex.rdf. This
+was done with this [[https://github.com/arvados/bh20-seq-resource/commit/f4ed46dae20abe5147871495ede2d6ac2b0854bc][patch]]. In the end we decided to use the Dublin Core title,
+http://purl.org/metadata/dublin_core_elements#Title:
+
+#+BEGIN_SRC js
+:licenseShape{
+ cc:License xsd:string;
+ dc:Title xsd:string ?;
+ cc:attributionName xsd:string ?;
+ cc:attributionURL xsd:string ?;
+ cc:attributionSource xsd:string ?;
+}
+#+END_SRC
+
+Note that cc:attributionSource is not actually defined in the CC vocabulary.
+
+When pushing the license info we discovered the workflow broke because
+the existing data had no licensing info. So we made the license field
+optional: a missing license is assumed to be CC-BY-4.0.
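+
+A hypothetical helper (not the project's code) shows what that default
+amounts to: fill in a CC-BY-4.0 license block whenever the uploader
+left it out, before validation runs.
+
+#+BEGIN_SRC python
+# Hypothetical default-license helper; key names follow the metadata
+# YAML (license -> license_type), the default follows this blog post.
+DEFAULT_LICENSE = "http://creativecommons.org/licenses/by/4.0/"
+
+def ensure_license(metadata):
+    """Return metadata with a license block, defaulting to CC-BY-4.0."""
+    if not metadata.get("license"):
+        metadata["license"] = {"license_type": DEFAULT_LICENSE}
+    return metadata
+
+print(ensure_license({"id": "placeholder"})["license"]["license_type"])
+#+END_SRC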
+
+* TODO Testing the license fields
diff --git a/doc/web/about.org b/doc/web/about.org
index ad13bc3..1949e2d 100644
--- a/doc/web/about.org
+++ b/doc/web/about.org
@@ -140,7 +140,8 @@ See the [[http://covid19.genenetwork.org/blog]]!
* How do I change the work flows?
-See the [[http://covid19.genenetwork.org/blog]]!
+Workflows are on [[https://github.com/arvados/bh20-seq-resource/tree/master/workflows][github]] and can be modified. See also the
+[[http://covid19.genenetwork.org/blog][workflow blog]].
* How do I change the source code?
diff --git a/example/minimal_metadata_example.yaml b/example/minimal_metadata_example.yaml
index 51f8a87..1b46cc7 100644
--- a/example/minimal_metadata_example.yaml
+++ b/example/minimal_metadata_example.yaml
@@ -1,5 +1,9 @@
id: placeholder
+
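+# The license block is optional; when omitted, CC-BY-4.0 is assumed.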
+license:
+ license_type: http://creativecommons.org/licenses/by/4.0/
+
host:
host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
@@ -15,4 +19,4 @@ technology:
sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
submitter:
- authors: [John Doe] \ No newline at end of file
+ authors: [John Doe]
diff --git a/scripts/cleanup.py b/scripts/cleanup.py
new file mode 100644
index 0000000..78f34c8
--- /dev/null
+++ b/scripts/cleanup.py
@@ -0,0 +1,41 @@
+import arvados
+import arvados.util
+
+api = arvados.api()
+
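+# Validation errors considered unrecoverable: collections whose "errors"
+# property matches one of these patterns are trashed outright.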
+delete_patterns = [
+ "%missing%`collection_location`%",
+ "%missing%`technology`%",
+ "%missing%`host_species`%",
+ "%QC fail: alignment%",
+ "%does not look like a valid URI%",
+ "%Duplicate of%",
+ "%No matching triples found for predicate obo:NCIT_C42781%",
+ "%does not look like a valid URI%"
+ ]
+
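+# Errors that may have been fixed since: matching collections get their
+# status/errors properties cleared so they are revalidated.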
+revalidate_patterns = [
+ "%missing%`license`%",
+ "%QC fail%"
+]
+
+for p in delete_patterns:
+ c = arvados.util.list_all(api.collections().list, filters=[
+ ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+ ["properties.errors", "like", p]])
+ for i in c:
+ print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+ api.collections().delete(uuid=i["uuid"]).execute()
+
+for p in revalidate_patterns:
+ c = arvados.util.list_all(api.collections().list, filters=[
+ ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+ ["properties.errors", "like", p]])
+ for i in c:
+ print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+ pr = i["properties"]
+ if "status" in pr:
+ del pr["status"]
+ if "errors" in pr:
+ del pr["errors"]
+ api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute()
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
deleted file mode 100644
index 88acb18..0000000
--- a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
+++ /dev/null
Binary files differ
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
new file mode 100644
index 0000000..93ef550
--- /dev/null
+++ b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
Binary files differ
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index ef0d119..352a30e 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -8,7 +8,7 @@ import gzip
dir_yaml = 'yaml'
-date = '2020.07.05'
+date = '2020.07.09'
# Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D)
# Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases)
@@ -50,13 +50,14 @@ sra_metadata_xml_file.close()
EXPERIMENT_PACKAGE_SET = tree.getroot()
missing_value_list = []
+not_created_accession_list = []
run_accession_set = set()
run_accession_to_downloadble_file_url_dict = {}
for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
#print(i, EXPERIMENT_PACKAGE)
-
+
# A general default-empty yaml could be read from the definitive one
info_for_yaml_dict = {
'id': 'placeholder',
@@ -74,17 +75,17 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
#print(accession)
info_for_yaml_dict['sample']['sample_id'] = accession
-
+
#SRAFiles = RUN.find('SRAFiles')
#if SRAFiles is not None:
# url = SRAFiles.find('SRAFile').attrib['url']
# if 'sra-download.ncbi.nlm.nih.gov' in url:
# run_accession_to_downloadble_file_url_dict[accession] = url
-
+
SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE')
SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE')
-
+
for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list:
VALUE = SAMPLE_ATTRIBUTE.find('VALUE')
if VALUE is not None:
@@ -101,7 +102,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
elif TAG_text in ['host_health_status', 'host health state']:
if VALUE_text in term_to_uri_dict:
- info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
+ info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
elif TAG_text in ['strain', 'isolate']:
@@ -113,12 +114,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if value_to_insert in term_to_uri_dict:
value_to_insert = term_to_uri_dict[value_to_insert]
-
- if 'virus_strain' not in info_for_yaml_dict:
+
+ if 'virus_strain' not in info_for_yaml_dict:
info_for_yaml_dict['virus']['virus_strain'] = value_to_insert
else:
info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert
- elif TAG_text in ['isolation_source', 'isolation source host-associated']:
+ elif TAG_text in ['isolation_source', 'isolation source host-associated']:
if VALUE_text in term_to_uri_dict:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
else:
@@ -145,7 +146,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
elif TAG_text == 'collected_by':
if VALUE_text.lower() not in ['not available', 'missing']:
name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli']
-
+
info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text
elif TAG_text == 'collecting institution':
if VALUE_text.lower() not in ['not provided', 'na']:
@@ -154,11 +155,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if VALUE_text.lower() not in ['not applicable', 'missing', 'na']:
date_to_write = VALUE_text
date_is_estimated = True
-
+
VALUE_text_list = VALUE_text.split('-')
if len(VALUE_text_list) == 3:
date_is_estimated = False
-
+
if VALUE_text_list[1].isalpha():
date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
elif len(VALUE_text_list) == 2:
@@ -170,7 +171,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
date_to_write = "{}-01-15".format(VALUE_text)
info_for_yaml_dict['sample']['collection_date'] = date_to_write
-
+
if date_is_estimated:
if 'additional_collection_information' in info_for_yaml_dict['sample']:
info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
@@ -188,8 +189,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id
-
-
+
+
EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]
@@ -206,18 +207,18 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION')
info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession']
-
+
if SUBMISSION.attrib['lab_name'].lower() not in ['na']:
info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name']
- STUDY = EXPERIMENT_PACKAGE.find('STUDY')
+ STUDY = EXPERIMENT_PACKAGE.find('STUDY')
info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias']
-
-
+
+
Organization = EXPERIMENT_PACKAGE.find('Organization')
Organization_Name = Organization.find('Name')
info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text]
-
+
Organization_Contact = Organization.find('Contact')
if Organization_Contact is not None:
Organization_Contact_Name = Organization_Contact.find('Name')
@@ -231,20 +232,33 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
Organization_Address = Organization.find('Address')
if Organization_Address is not None:
info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']])
-
+
if 'collection_date' not in info_for_yaml_dict['sample']:
info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession, ' - technology not found')
+ #print(accession, ' - technology not found')
+ not_created_accession_list.append([accession, 'technology not found'])
+ continue
+
+ if 'host_species' not in info_for_yaml_dict['host']:
+        #print(accession, ' - host species not found')
+ not_created_accession_list.append([accession, 'missing host species'])
continue
with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
json.dump(info_for_yaml_dict, fw, indent=2)
-
+
if len(missing_value_list) > 0:
- path_missing_terms_tsv = 'missing_terms.tsv'
+ path_missing_terms_tsv = 'missing_terms.sra.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
+
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.sra.tsv'
+    print('Written the accessions that were not created in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv
index 40572a3..0bfc455 100644
--- a/scripts/dict_ontology_standardization/ncbi_host_species.csv
+++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv
@@ -2,6 +2,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
human,http://purl.obolibrary.org/obo/NCBITaxon_9606
Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
+homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 39e401a..dbebfbb 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -138,6 +138,7 @@ min_len_to_count = 27500
num_seq_with_len_ge_X_bp = 0
missing_value_list = []
+not_created_accession_list = []
accession_with_errors_list = []
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession_version, ' - technology not found')
+ #print(accession_version, ' - technology not found')
+ not_created_accession_list.append([accession_version, 'technology not found'])
continue
with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -389,15 +391,21 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
continue
if len(missing_value_list) > 0:
- path_missing_terms_tsv = 'missing_terms.tsv'
+ path_missing_terms_tsv = 'missing_terms.genbank.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
if len(accession_with_errors_list) > 0:
- path_accession_with_errors_tsv = 'accession_with_errors.tsv'
+ path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv'
print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
with open(path_accession_with_errors_tsv, 'w') as fw:
fw.write('\n'.join(accession_with_errors_list))
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.genbank.tsv'
+    print('Written the accessions that were not created in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))