From afbc3ec99f638a2f8df96a8e952b5b9616dc99a8 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 13:31:49 -0400 Subject: Now moves collections into 'validated sequences' project Improve logging for seq service Fix uploader bug Runs workflow with all validated sequences. --- bh20seqanalyzer/main.py | 83 +++++++++++++++++++++++++++++++++++++++---------- bh20sequploader/main.py | 3 +- setup.py | 2 +- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 23e58e9..dae8eca 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -1,29 +1,70 @@ import argparse import arvados +import arvados.collection import time import subprocess import tempfile import json +import logging + +logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) +logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) + +def validate_upload(api, collection, validated_project): + col = arvados.collection.Collection(collection["uuid"]) + + # validate the collection here. Check metadata, etc. + valid = True + + if "sequence.fasta" not in col: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) + if "metadata.jsonld" not in col: + logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + valid = False + + dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], + ["portable_data_hash", "=", col.portable_data_hash()]]).execute() + if dup["items"]: + # This exact collection has been uploaded before. + valid = False + logging.warn("Upload '%s' is duplicate" % collection["name"]) + + if valid: + logging.info("Added '%s' to validated sequences" % collection["name"]) + # Move it to the "validated" project to be included in the next analysis + api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + else: + # It is invalid, delete it. 
+ logging.warn("Deleting '%s'" % collection["name"]) + api.collections().delete(uuid=collection["uuid"]).execute() + + return valid + +def start_analysis(api, + analysis_project, + workflow_uuid, + validated_project): -def start_analysis(api, collection, analysis_project, workflow_uuid): project = api.groups().create(body={ "group_class": "project", - "name": "Analysis of %s" % collection["name"], + "name": "Pangenome analysis", "owner_uuid": analysis_project, }, ensure_unique_name=True).execute() + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + with tempfile.NamedTemporaryFile() as tmp: - inputobj = json.dumps({ - "sequence": { + inputobj = { + "inputReads": [] + } + for v in validated: + inputobj["inputReads"].append({ "class": "File", - "location": "keep:%s/sequence.fasta" % collection["portable_data_hash"] - }, - "metadata": { - "class": "File", - "location": "keep:%s/metadata.jsonld" % collection["portable_data_hash"] - } - }, indent=2) - tmp.write(inputobj.encode('utf-8')) + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", "--submit", @@ -32,24 +73,32 @@ def start_analysis(api, collection, analysis_project, workflow_uuid): "--project-uuid=%s" % project["uuid"], "arvwf:%s" % workflow_uuid, tmp.name] - print("Running %s" % ' '.join(cmd)) + logging.info("Running %s" % ' '.join(cmd)) comp = subprocess.run(cmd, capture_output=True) if comp.returncode != 0: - print(comp.stderr.decode('utf-8')) - else: - api.collections().update(uuid=collection["uuid"], body={"owner_uuid": project['uuid']}).execute() + logging.error(comp.stderr.decode('utf-8')) + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') args = parser.parse_args() api = arvados.api() + logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) + while True: new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() + at_least_one_new_valid_seq = False for c in new_collections["items"]: - start_analysis(api, c, args.analysis_project, args.workflow_uuid) + at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + + if at_least_one_new_valid_seq: + start_analysis(api, args.analysis_project, + args.workflow_uuid, + args.validated_project) time.sleep(10) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 17ad492..d3ebc0c 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,4 +49,5 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) -main() +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 9e73ff0..0685d37 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import setuptools.command.egg_info as egg_info_cmd from setuptools import setup SETUP_DIR = os.path.dirname(__file__) -README = os.path.join(SETUP_DIR, "README.rst") +README = os.path.join(SETUP_DIR, "README.md") try: 
import gittaggers -- cgit v1.2.3 From 40df65dec296b81650987c8ee4f832b703ab8f74 Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 7 Apr 2020 19:51:49 +0200 Subject: adding dummy metadata qc to the project --- bh20sequploader/qc_metadata.py | 13 +++++++++++++ example/dummyschema.yaml | 16 ++++++++++++++++ example/metadata.json | 0 example/metadata.yaml | 17 +++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 bh20sequploader/qc_metadata.py create mode 100644 example/dummyschema.yaml delete mode 100644 example/metadata.json create mode 100644 example/metadata.yaml diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py new file mode 100644 index 0000000..0632777 --- /dev/null +++ b/bh20sequploader/qc_metadata.py @@ -0,0 +1,13 @@ +import yamale + +## NOTE: this is just a DUMMY. Everything about this can and will change +def qc_metadata(metadatafile): + print("Start metadata validation...") + schema = yamale.make_schema('../example/dummyschema.yaml') + data = yamale.make_data(metadatafile) + # Validate data against the schema. Throws a ValueError if data is invalid. + yamale.validate(schema, data) + print("...complete!") + +#qc_metadata("../example/metadata.yaml") + diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml new file mode 100644 index 0000000..e428324 --- /dev/null +++ b/example/dummyschema.yaml @@ -0,0 +1,16 @@ +#sampleInformation: include('sampleInformation') +#InstituteInformation: include('InstituteInformation') +--- +sampleInformation: + location : str() + host : str() + sequenceTechnology: str() + assemblyMethod: str() + +InstituteInformation: + OriginatingLab: str() + SubmittingLab: str() + +VirusDetail: + VirusName: str() + AccessionId: str() diff --git a/example/metadata.json b/example/metadata.json deleted file mode 100644 index e69de29..0000000 diff --git a/example/metadata.yaml b/example/metadata.yaml new file mode 100644 index 0000000..587d0be --- /dev/null +++ b/example/metadata.yaml @@ -0,0 +1,17 @@ +sampleInformation: + location: "USA" + host : "Homo Sapiens" + sequenceTechnology: "Sanger" + assemblyMethod: "CLC Genomics" + +InstituteInformation: + OriginatingLab: "Erik's kitchen" + SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" + +SubmitterInformation: + Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" + submissionDate: "04-04-2020" + +VirusDetail: + VirusName: "hCoV-19/USA/identifer/2020" + AccessionId: "EPI_ISL_Random" -- cgit v1.2.3 From 102a3663123292375440eeb04276d22a5b4645e0 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:27:10 -0400 Subject: Copy recent results to a set destination --- bh20seqanalyzer/main.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index dae8eca..2db97f6 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -11,7 +11,7 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, latest_result_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. 
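The hunk below introduces the result-copying logic; as a rough, free-standing sketch of the lookup it performs (the `analysis_project` value here is the `--analysis-project` default used elsewhere in this series, and the `arvados-python-client` calls are the same ones the patch itself uses):

```python
import arvados

api = arvados.api()

# Hypothetical: the --analysis-project default from bh20seqanalyzer/main.py.
analysis_project = "lugli-j7d0g-y4k4uswcqi3ku56"

# Newest analysis project first; requesting_container_uuid == None selects the
# top-level workflow container request rather than its child steps.
newest = api.groups().list(filters=[["owner_uuid", "=", analysis_project]],
                           order="created_at desc", limit=1).execute()
for project in newest["items"]:
    crs = api.container_requests().list(filters=[
        ["owner_uuid", "=", project["uuid"]],
        ["requesting_container_uuid", "=", None]]).execute()
    if crs["items"] and crs["items"][0]["output_uuid"]:
        out = api.collections().get(uuid=crs["items"][0]["output_uuid"]).execute()
        print(project["name"], "->", out["portable_data_hash"])
```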
@@ -79,12 +79,31 @@ def start_analysis(api, logging.error(comp.stderr.decode('utf-8')) +def copy_most_recent_result(api, analysis_project, latest_result_uuid): + most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], + order="created_at desc").execute() + for m in most_recent_analysis["items"]: + cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + wf = cr["items"][0] + src = api.collections().get(uuid=wf["output_uuid"]).execute() + dst = api.collections().get(uuid=latest_result_uuid).execute() + if src["portable_data_hash"] != dst["portable_data_hash"]: + logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) + api.collections().update(uuid=latest_result_uuid, + body={"manifest_text": src["manifest_text"], + "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + break + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -101,4 +120,7 @@ def main(): start_analysis(api, args.analysis_project, args.workflow_uuid, args.validated_project) + + copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + time.sleep(10) -- cgit v1.2.3 From 4215a82af730ff05b8fe98e226b759413cdf95f7 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:37:19 -0400 Subject: limit 1 --- bh20seqanalyzer/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2db97f6..2513ea3 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -81,7 +81,7 @@ def start_analysis(api, def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], - order="created_at desc").execute() + order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], ["requesting_container_uuid", "=", None]]).execute() -- cgit v1.2.3 From 37652786cb6605a4862e820f2ba85f2fe818952f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 7 Apr 2020 11:58:33 -0700 Subject: Make README more didactic --- README.md | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index ec9afb1..a6fe052 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,162 @@ # Sequence uploader -This repository provides a sequence uploader for the +This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. 
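Taken together, the analyzer patches above amount to a poll, validate, analyze loop. A condensed sketch of that loop (a simplification, not the exact daemon code; the project UUIDs are the defaults from `bh20seqanalyzer/main.py`):

```python
import time

import arvados
import arvados.collection

api = arvados.api()

# Defaults from bh20seqanalyzer/main.py.
uploader_project = "lugli-j7d0g-n5clictpuvwk8aa"
validated_project = "lugli-j7d0g-5ct8p1i1wrgyjvp"

while True:
    new = api.collections().list(
        filters=[["owner_uuid", "=", uploader_project]]).execute()
    for c in new["items"]:
        col = arvados.collection.Collection(c["uuid"])
        # A duplicate is any validated collection with the same content hash.
        dup = api.collections().list(filters=[
            ["owner_uuid", "=", validated_project],
            ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
        if "sequence.fasta" in col and "metadata.jsonld" in col and not dup["items"]:
            # Valid: move it into the validated project for the next analysis run.
            api.collections().update(
                uuid=c["uuid"], body={"owner_uuid": validated_project}).execute()
        else:
            # Invalid or duplicate: delete it, as validate_upload() does.
            api.collections().delete(uuid=c["uuid"]).execute()
    time.sleep(10)
```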
-# Run
+To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage).
 
-Run the uploader with a FASTA file and accompanying metadata:
+# Installation
 
-    python3 bh20sequploader/main.py example/sequence.fasta example/metadata.json
+There are several ways to install the uploader. The most portable is with a [virtualenv](#installation-with-virtualenv).
 
-# Add a workflow
+## Installation with `virtualenv`
 
-get your SARS-CoV-2 sequences from GenBank in seqs.fa
+1. **Prepare your system.** You need to make sure you have Python, and the ability to install modules such as `pycurl` and `pyopenssl`. On Ubuntu 18.04, you can run:
 
 ```sh
-minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf
-seqwish -s seqs.fa -p seqs.paf -g seqs.gfa
-odgi build -g seqs.gfa -s -o seqs.odgi
-odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5
+sudo apt update
+sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev
 ```
 
-from https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes
+2. **Create and enter your virtualenv.** Go to some memorable directory and make and enter a virtualenv:
 
-# Installation
+```sh
+virtualenv --python python3 venv
+. venv/bin/activate
+```
+
+Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool.
+
+3. **Install the tool.** Once in your virtualenv, install this project:
+
+```sh
+pip3 install git+https://github.com/arvados/bh20-seq-resource.git@master
+```
+
+4. **Test the tool.** Try running:
+
+```sh
+bh20-seq-uploader --help
+```
+
+It should print some instructions about how to use the uploader.
+
+**Make sure you are in your virtualenv whenever you run the tool!** If you ever can't run the tool, and your prompt doesn't say `(venv)`, try going to the directory where you put the virtualenv and running `. venv/bin/activate`. It only works for the current terminal window; you will need to run it again if you open a new terminal.
+
+## Installation with `pip3 --user`
+
+If you don't want to have to enter a virtualenv every time you use the uploader, you can use the `--user` feature of `pip3` to install the tool for your user.
+
+1. **Prepare your system.** Just as for the `virtualenv` method, you need to install some dependencies. On Ubuntu 18.04, you can run:
+
+```sh
+sudo apt update
+sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev
+```
+
+2. **Install the tool.** You can run:
+
+```sh
+pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master
+```
+
+3. **Make sure the tool is on your `PATH`.** The `pip3` command will install the uploader in `.local/bin` inside your home directory. Your shell may not know to look for commands there by default. To fix this for the terminal you currently have open, run:
+
+```sh
+export PATH=$PATH:$HOME/.local/bin
+```
+
+To make this change permanent, assuming your shell is Bash, run:
+
+```sh
+echo 'export PATH=$PATH:$HOME/.local/bin' >>~/.bashrc
+```
+
+4. **Test the tool.** Try running:
+
+```sh
+bh20-seq-uploader --help
+```
+
+It should print some instructions about how to use the uploader.
 
-This tool requires the arvados Python module which can be installed
-using .deb or .rpm packages through
-https://doc.arvados.org/v2.0/sdk/python/sdk-python.html.
The actual -code lives [here](https://github.com/arvados/arvados/tree/master/sdk/python) and -suggests a local install using +## Installation from Source for Development - apt-get install libcurl4-openssl-dev libssl1.0-dev - pip3 install --user arvados-python-client +If you plan to contribute to the project, you may want to install an editable copy from source. With this method, changes to the source code are automatically reflected in the installed copy of the tool. -Next update +1. **Prepare your system.** On Ubuntu 18.04, you can run: - export PATH=$PATH:$HOME/.local/bin +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Clone and enter the repository.** You can run: + +```sh +git clone https://github.com/arvados/bh20-seq-resource.git +cd bh20-seq-resource +``` + +3. **Create and enter a virtualenv.** Go to some memorable directory and make and enter a virtualenv: + +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +4. **Install the checked-out repository in editable mode.** Once in your virtualenv, install with this special pip command: + +```sh +pip3 install -e . +``` + +5. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +## Installation with GNU Guix -## Install with GNU Guix +Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. -Set up a container: +1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: + +```sh +~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs +``` + +2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +# Usage + +Run the uploader with a FASTA file and accompanying metadata file in [JSON-LD format](https://json-ld.org/): + +```sh +bh20-seq-uploader example/sequence.fasta example/metadata.json +``` + +## Workflow for Generating a Pangenome + +All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. + +Get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run: + +```sh +minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf +seqwish -s seqs.fa -p seqs.paf -g seqs.gfa +odgi build -g seqs.gfa -s -o seqs.odgi +odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +``` - ~/opt/guix/bin/guix environment -C guix --ad-hoc python openssl python-pycurl nss-certs - pip3 install --user arvados-python-client +For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). 
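Under the hood, the `bh20-seq-uploader` command used above boils down to a few Arvados SDK calls. A rough sketch, with placeholder credentials and file names (the real values and the block-wise copy live in `bh20sequploader/main.py`):

```python
import arvados
import arvados.collection

# Placeholders: the real host, token, and project UUID are hard-coded in
# bh20sequploader/main.py.
api = arvados.api(host="lugli.arvadosapi.com", token="<api token>", insecure=True)

col = arvados.collection.Collection(api_client=api)

# Stream the FASTA into the new collection a block at a time.
with open("example/sequence.fasta") as seq, col.open("sequence.fasta", "w") as out:
    buf = seq.read(65536)
    while buf:
        out.write(buf)
        buf = seq.read(65536)

# Saving into the uploader project is what makes the analyzer daemon pick it up.
col.save_new(name="uploaded sequence", owner_uuid="lugli-j7d0g-n5clictpuvwk8aa",
             ensure_unique_name=True)
```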
-Pip installed the following modules - arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-client-1.6.7 httplib2-0.17.1 oauth2client-4.1.3 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.0 ruamel.yaml-0.15.77 six-1.14.0 uritemplate-3.0.1 ws4py-0.5.1 -- cgit v1.2.3 From 07bc4c65535437b8e9e0744f08da8cea541d0116 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 15:28:42 -0400 Subject: Add metadata validation with schema-salad --- bh20seqanalyzer/main.py | 11 ++++++++--- bh20sequploader/bh20seq-schema.yml | 36 ++++++++++++++++++++++++++++++++++++ bh20sequploader/main.py | 7 +++++-- bh20sequploader/qc_metadata.py | 26 +++++++++++++++++--------- example/dummyschema.yaml | 16 ---------------- setup.py | 3 ++- 6 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 bh20sequploader/bh20seq-schema.yml delete mode 100644 example/dummyschema.yaml diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2513ea3..78e32c9 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -6,12 +6,14 @@ import subprocess import tempfile import json import logging +import ruamel.yaml +from bh20sequploader.qc_metadata import qc_metadata logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project, latest_result_uuid): +def validate_upload(api, collection, validated_project): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. @@ -20,9 +22,12 @@ def validate_upload(api, collection, validated_project, latest_result_uuid): if "sequence.fasta" not in col: valid = False logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) - if "metadata.jsonld" not in col: - logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + if "metadata.yaml" not in col: + logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False + else: + metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) + valid = qc_metadata(metadata_content) and valid dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml new file mode 100644 index 0000000..6e0973a --- /dev/null +++ b/bh20sequploader/bh20seq-schema.yml @@ -0,0 +1,36 @@ +$graph: + +- name: sampleInformationSchema + type: record + fields: + location: string + host: string + sequenceTechnology: string + assemblyMethod: string + +- name: InstituteInformationSchema + type: record + fields: + OriginatingLab: string + SubmittingLab: string + +- name: SubmitterInformationSchema + type: record + fields: + Submitter: string + submissionDate: string + +- name: VirusDetailSchema + type: record + fields: + VirusName: string + AccessionId: string + +- name: MainSchema + type: record + documentRoot: true + fields: + sampleInformation: sampleInformationSchema + InstituteInformation: InstituteInformationSchema + SubmitterInformation: SubmitterInformationSchema + VirusDetail: VirusDetailSchema diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index d3ebc0c..8b8fefe 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,6 +6,7 @@ import json import urllib.request import socket import getpass +from .qc_metadata import qc_metadata 
ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -19,6 +20,8 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + qc_metadata(args.metadata.name) + col = arvados.collection.Collection(api_client=api) print("Reading FASTA") @@ -29,8 +32,8 @@ def main(): f.write(r) r = args.sequence.read(65536) - print("Reading JSONLD") - with col.open("metadata.jsonld", "w") as f: + print("Reading metadata") + with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 0632777..78b31b2 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,13 +1,21 @@ -import yamale +import schema_salad.schema +import logging +import pkg_resources -## NOTE: this is just a DUMMY. Everything about this can and will change def qc_metadata(metadatafile): - print("Start metadata validation...") - schema = yamale.make_schema('../example/dummyschema.yaml') - data = yamale.make_data(metadatafile) - # Validate data against the schema. Throws a ValueError if data is invalid. - yamale.validate(schema, data) - print("...complete!") + schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") + cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} + (document_loader, + avsc_names, + schema_metadata, + metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) -#qc_metadata("../example/metadata.yaml") + if not isinstance(avsc_names, schema_salad.avro.schema.Names): + print(avsc_names) + return False + try: + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) + return True + except: + return False diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml deleted file mode 100644 index e428324..0000000 --- a/example/dummyschema.yaml +++ /dev/null @@ -1,16 +0,0 @@ -#sampleInformation: include('sampleInformation') -#InstituteInformation: include('InstituteInformation') ---- -sampleInformation: - location : str() - host : str() - sequenceTechnology: str() - assemblyMethod: str() - -InstituteInformation: - OriginatingLab: str() - SubmittingLab: str() - -VirusDetail: - VirusName: str() - AccessionId: str() diff --git a/setup.py b/setup.py index 0685d37..48c25aa 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client"] +install_requires = ["arvados-python-client", "schema-salad"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] @@ -30,6 +30,7 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer"], + package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, install_requires=install_requires, setup_requires=[] + pytest_runner, tests_require=["pytest<5"], -- cgit v1.2.3 From 14ff178ed7f77a996f47e2115e2a1429f6b69356 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 12:12:49 -0700 Subject: Spell correctly --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6fe052..1448f4c 100644 --- 
a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. -To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage). +To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [upload your data](#usage). # Installation -- cgit v1.2.3 From 414c308b8860d1b20481a2ec3b2f6381e4f6061b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 14:11:39 -0700 Subject: Initial commit of working frontend --- __pycache__/main.cpython-36.pyc | Bin 0 -> 2716 bytes main.py | 98 ++++++++++++++++++++++++++++++++++++++++ pages/index.html | 28 ++++++++++++ templates/error.html | 19 ++++++++ templates/success.html | 24 ++++++++++ 5 files changed, 169 insertions(+) create mode 100644 __pycache__/main.cpython-36.pyc create mode 100644 main.py create mode 100644 pages/index.html create mode 100644 templates/error.html create mode 100644 templates/success.html diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc new file mode 100644 index 0000000..250c562 Binary files /dev/null and b/__pycache__/main.cpython-36.pyc differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..630669c --- /dev/null +++ b/main.py @@ -0,0 +1,98 @@ +import tempfile +import shutil +import subprocess +import os +from flask import Flask, request, redirect, send_file, send_from_directory, render_template + +app = Flask(__name__, static_url_path='/static', static_folder='static') + +# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. +# We will enforce the limit ourselves and set a higher safety limit here. +app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 + +# When a file is too big we get a 413. +@app.errorhandler(413) +def handle_large_file(e): + return (render_template('error.html', + error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + +@app.route('/') +def send_form(): + """ + Send the file upload form/front page. + """ + return send_from_directory('pages', 'index.html') + +class FileTooBigError(RuntimeError): + """ + Raised when the user gives a file that is too large. + """ + pass + +def copy_with_limit(in_file, out_file, limit=1024*1024): + """ + Copy a file stream, and raise FileTooBigError if the file is too big. + """ + + bytes_used = 0 + buf_size = 65536 + + buf = in_file.read(buf_size) + bytes_used += len(buf) + while buf: + if bytes_used > limit: + raise FileTooBigError('Hit file length limit') + out_file.write(buf) + buf = in_file.read(buf_size) + bytes_used += len(buf) + + +@app.route('/submit', methods=['POST']) +def recieve_files(): + """ + Recieve the uploaded files. 
+ """ + + # We're going to work in one directory per request + dest_dir = tempfile.mkdtemp() + try: + + print(request) + print(request.files) + + if 'fasta' not in request.files: + return (render_template('error.html', + error_message="You did not include a FASTA file."), 403) + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') + + try: + with open(fasta_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('fasta').stream, out_stream) + with open(metadata_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + + # Try and upload files to Arvados + result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode != 0: + # It didn't work. Complain. + error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) + return (render_template('error.html', error_message=error_message), 403) + else: + # It worked. Say so. + return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) + finally: + shutil.rmtree(dest_dir) + + + + diff --git a/pages/index.html b/pages/index.html new file mode 100644 index 0000000..2269791 --- /dev/null +++ b/pages/index.html @@ -0,0 +1,28 @@ + + + + + + Simple Web Uploader for Public SARS-CoV-2 Sequence Resource + + +

+        <h1>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</h1>
+        <hr>
+        <p>
+            This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the Public SARS-CoV-2 Sequence Resource. Your uploaded sequence will automatically be processed and incorporated into the public pangenome.
+        </p>
+        <hr>
+        <form action="/submit" method="POST" enctype="multipart/form-data">
+            <label for="fasta">Sequence (FASTA):</label>
+            <br>
+            <input type="file" id="fasta" name="fasta" required>
+            <br>
+            <label for="metadata">Metadata in JSON-LD format:</label>
+            <br>
+            <input type="file" id="metadata" name="metadata" required>
+            <br>
+            <input type="submit" value="Add to Pangenome">
+        </form>
+        <hr>
+    </body>
+</html>
diff --git a/templates/error.html b/templates/error.html
new file mode 100644
index 0000000..c2ab0a4
--- /dev/null
+++ b/templates/error.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1">
+        <title>Upload Failed</title>
+    </head>
+    <body>
+        <h1>Upload Failed</h1>
+        <hr>
+        <p>
+            Your upload has failed. {{error_message}}
+        </p>
+        <hr>
+        <p>
+            <a href="/">Click here</a> to try again.
+        </p>
+    </body>
+</html>
diff --git a/templates/success.html b/templates/success.html
new file mode 100644
index 0000000..1be7861
--- /dev/null
+++ b/templates/success.html
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1">
+        <title>Upload Successful</title>
+    </head>
+    <body>
+        <h1>Upload Successful</h1>
+        <hr>
+        <p>
+            Your files have been uploaded. They should soon appear as part of the Public SARS-CoV-2 Sequence Resource.
+        </p>
+        <hr>
+        <p>
+            The upload log was:
+        </p>
+        <pre>{{log}}</pre>
+        <hr>
+        <p>
+            <a href="/">Click here</a> to upload more files.
+        </p>
+ + -- cgit v1.2.3 From 9458ed33da08c787c4bb20af7b4108c93334b351 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 8 Apr 2020 17:41:19 -0400 Subject: Fastq now runs through fastq2fasta pipeline then gets added to pangenome analysis. --- bh20seqanalyzer/main.py | 141 ++++++++++++++++++++++++++++++----------- bh20sequploader/main.py | 14 +++- bh20sequploader/qc_metadata.py | 6 +- 3 files changed, 120 insertions(+), 41 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 78e32c9..1a8965b 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -13,21 +13,30 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, + fastq_project, fastq_workflow_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. valid = True - if "sequence.fasta" not in col: - valid = False - logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) if "metadata.yaml" not in col: logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False else: metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) - valid = qc_metadata(metadata_content) and valid + #valid = qc_metadata(metadata_content) and valid + if not valid: + logging.warn("Failed metadata qc") + + if valid: + if "sequence.fasta" not in col: + if "reads.fastq" in col: + start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) + return False + else: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() @@ -39,7 +48,9 @@ def validate_upload(api, collection, validated_project): if valid: logging.info("Added '%s' to validated sequences" % collection["name"]) # Move it to the "validated" project to be included in the next analysis - api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + api.collections().update(uuid=collection["uuid"], body={ + "owner_uuid": validated_project, + "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute() else: # It is invalid, delete it. 
logging.warn("Deleting '%s'" % collection["name"]) @@ -47,28 +58,15 @@ def validate_upload(api, collection, validated_project): return valid -def start_analysis(api, - analysis_project, - workflow_uuid, - validated_project): +def run_workflow(api, parent_project, workflow_uuid, name, inputobj): project = api.groups().create(body={ "group_class": "project", - "name": "Pangenome analysis", - "owner_uuid": analysis_project, + "name": name, + "owner_uuid": parent_project, }, ensure_unique_name=True).execute() - validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) - with tempfile.NamedTemporaryFile() as tmp: - inputobj = { - "inputReads": [] - } - for v in validated: - inputobj["inputReads"].append({ - "class": "File", - "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] - }) tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", @@ -83,32 +81,95 @@ def start_analysis(api, if comp.returncode != 0: logging.error(comp.stderr.decode('utf-8')) + return project + + +def start_fastq_to_fasta(api, collection, + analysis_project, + fastq_workflow_uuid): + newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", { + "fastq_forward": { + "class": "File", + "location": "keep:%s/reads.fastq" % collection["portable_data_hash"] + }, + "metadata": { + "class": "File", + "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"] + }, + "ref_fasta": { + "class": "File", + "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta" + } + }) + api.collections().update(uuid=collection["uuid"], + body={"owner_uuid": newproject["uuid"]}).execute() + +def start_pangenome_analysis(api, + analysis_project, + pangenome_workflow_uuid, + validated_project): + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + inputobj = { + "inputReads": [] + } + for v in validated: + inputobj["inputReads"].append({ + "class": "File", + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj) + + +def get_workflow_output_from_project(api, uuid): + cr = api.container_requests().list(filters=[['owner_uuid', '=', uuid], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + return cr["items"][0] + else: + return None + def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: - cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], - ["requesting_container_uuid", "=", None]]).execute() - if cr["items"] and cr["items"][0]["output_uuid"]: - wf = cr["items"][0] + wf = get_workflow_output_from_project(api, m["uuid"]) + if wf: src = api.collections().get(uuid=wf["output_uuid"]).execute() dst = api.collections().get(uuid=latest_result_uuid).execute() if src["portable_data_hash"] != dst["portable_data_hash"]: logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) api.collections().update(uuid=latest_result_uuid, body={"manifest_text": src["manifest_text"], - "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute() break +def 
move_fastq_to_fasta_results(api, analysis_project, uploader_project): + projects = api.groups().list(filters=[['owner_uuid', '=', analysis_project], + ["properties.moved_output", "!=", True]], + order="created_at desc",).execute() + for p in projects["items"]: + wf = get_workflow_output_from_project(api, p["uuid"]) + if wf: + logging.info("Moving completed fastq2fasta result %s back to uploader project", wf["output_uuid"]) + api.collections().update(uuid=wf["output_uuid"], + body={"owner_uuid": uploader_project}).execute() + p["properties"]["moved_output"] = True + api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute() + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') - parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--pangenome-analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--fastq-project', type=str, default='lugli-j7d0g-xcjxp4oox2u1w8u', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') - parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') - parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') + + parser.add_argument('--pangenome-workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--fastq-workflow-uuid', type=str, default='lugli-7fd4e-2zp9q4jo5xpif9y', help='') + + parser.add_argument('--latest-result-collection', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -116,16 +177,24 @@ def main(): logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) while True: + move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project) + new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() at_least_one_new_valid_seq = False for c in new_collections["items"]: - at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + at_least_one_new_valid_seq = validate_upload(api, c, + args.validated_project, + args.fastq_project, + args.fastq_workflow_uuid) or at_least_one_new_valid_seq if at_least_one_new_valid_seq: - start_analysis(api, args.analysis_project, - args.workflow_uuid, - args.validated_project) + start_pangenome_analysis(api, + args.pangenome_analysis_project, + args.pangenome_workflow_uuid, + args.validated_project) - copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + copy_most_recent_result(api, + args.pangenome_analysis_project, + args.latest_result_collection) - time.sleep(10) + time.sleep(15) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 8b8fefe..56cbe22 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -20,12 +20,18 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - qc_metadata(args.metadata.name) + if not qc_metadata(args.metadata.name): + print("Failed metadata qc") + exit(1) col = arvados.collection.Collection(api_client=api) - print("Reading FASTA") - with col.open("sequence.fasta", "w") as f: + if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"): + 
target = "sequence.fasta" + elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"): + target = "reads.fastq" + + with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) while r: @@ -52,5 +58,7 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) + print("Done") + if __name__ == "__main__": main() diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 78b31b2..ebe4dfc 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,6 +1,7 @@ import schema_salad.schema import logging import pkg_resources +import logging def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") @@ -17,5 +18,6 @@ def qc_metadata(metadatafile): try: doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) return True - except: - return False + except Exception as e: + logging.warn(e) + return False -- cgit v1.2.3 From ce80c29ef5c93aed80ab3b98a3c2eedb740e32b6 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 15:07:39 -0700 Subject: Don't assert that the metadata is really JSON-LD --- pages/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pages/index.html b/pages/index.html index 2269791..c2e5b64 100644 --- a/pages/index.html +++ b/pages/index.html @@ -17,7 +17,7 @@

             <br>
             <input type="file" id="fasta" name="fasta" required>
             <br>
-            <label for="metadata">Metadata in JSON-LD format:</label>
+            <label for="metadata">Metadata:</label>
             <br>
             <input type="file" id="metadata" name="metadata" required>
             <br>

-- cgit v1.2.3 From 60420f991a5bd3502bc6b89747d408da0d922839 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 15:11:51 -0700 Subject: Add context links --- pages/index.html | 1 + 1 file changed, 1 insertion(+) diff --git a/pages/index.html b/pages/index.html index c2e5b64..543ab7d 100644 --- a/pages/index.html +++ b/pages/index.html @@ -24,5 +24,6 @@
+ Source · Made for COVID-19-BH20 -- cgit v1.2.3 From d7498093d0f5e0db052ef88815d57c2648d09425 Mon Sep 17 00:00:00 2001 From: lltommy Date: Thu, 9 Apr 2020 14:34:54 +0200 Subject: Updating schema and examples. This is still work in progress but we get there --- bh20sequploader/bh20seq-schema.yml | 60 ++++++++++++++++++++++++++------------ example/metadata.yaml | 49 ++++++++++++++++++++++--------- example/minimal_example.yaml | 14 +++++++++ 3 files changed, 91 insertions(+), 32 deletions(-) create mode 100644 example/minimal_example.yaml diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 6e0973a..38cfb48 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,36 +1,60 @@ $graph: -- name: sampleInformationSchema +- name: hostSchema type: record fields: - location: string - host: string - sequenceTechnology: string - assemblyMethod: string + host_id: string + host_species: string + host_common_name: string? + host_sex: string? + host_age: int? + host_age_unit: string? + host_health_status: string? + host_treatment: string? + additional_host_information: string? -- name: InstituteInformationSchema +- name: sampleSchema type: record fields: - OriginatingLab: string - SubmittingLab: string + collector_name: string + collecting_institution: string + specimen_source: string? + collection_date: string? + collection_location: string? + sample_storage_conditions: string? + additional_collection_information: string? -- name: SubmitterInformationSchema +- name: virusSchema type: record fields: - Submitter: string - submissionDate: string + virus_species: string? + virus_strain: string? -- name: VirusDetailSchema +- name: technologySchema type: record fields: - VirusName: string - AccessionId: string + sample_sequencing_technology: string + sequence_assembly_method: string? + sequencing_coverage: string? + +- name: submitterSchema + type: record + fields: + submitter_name: string + submitter_address: string? + originating_lab: string + lab_address: string? + provider_sample_id: string? + submitter_sample_id: string? + authors: string? + submitter_id: string? - name: MainSchema type: record documentRoot: true fields: - sampleInformation: sampleInformationSchema - InstituteInformation: InstituteInformationSchema - SubmitterInformation: SubmitterInformationSchema - VirusDetail: VirusDetailSchema + host: hostSchema + sample: sampleSchema + virus: virusSchema? 
+ technology: technologySchema + submitter: submitterSchema \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml index 587d0be..8a93379 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -1,17 +1,38 @@ -sampleInformation: - location: "USA" - host : "Homo Sapiens" - sequenceTechnology: "Sanger" - assemblyMethod: "CLC Genomics" +host: + host_id: XX1 + host_species: string + host_common_name: string + host_sex: string + host_age: 20 + host_age_unit: string + host_health_status: string + host_treatment: string + additional_host_information: string -InstituteInformation: - OriginatingLab: "Erik's kitchen" - SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" +sample: + collector_name: XXX + collecting_institution: XXX + specimen_source: XXX + collection_date: XXX + collection_location: XXX + sample_storage_conditions: XXX + additional_collection_information: XXX -SubmitterInformation: - Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" - submissionDate: "04-04-2020" +virus: + virus_species: XX + virus_strain: XX -VirusDetail: - VirusName: "hCoV-19/USA/identifer/2020" - AccessionId: "EPI_ISL_Random" +technology: + sample_sequencing_technology: XX + sequence_assembly_method: XX + sequencing_coverage: 70x + +submitter: + submitter_name: tester + submitter_address: testerAdd + originating_lab: testLab + lab_address: labAdd + provider_sample_id: string + submitter_sample_id: string + authors: testAuthor + submitter_id: X12 \ No newline at end of file diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml new file mode 100644 index 0000000..201b080 --- /dev/null +++ b/example/minimal_example.yaml @@ -0,0 +1,14 @@ +host: + host_id: XX + host_species: string + +sample: + collector_name: XXX + collecting_institution: XXX + +technology: + sample_sequencing_technology: XX + +submitter: + submitter_name: tester + originating_lab: testLab \ No newline at end of file -- cgit v1.2.3 From deedb2ed7046bbe81136b8d9d1edc353984d356b Mon Sep 17 00:00:00 2001 From: lltommy Date: Thu, 9 Apr 2020 20:38:55 +0200 Subject: Adding functionality of turning keys into ontology terms (URI). This is work in progress - of course! --- bh20sequploader/bh20seq-schema.yml | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 38cfb48..fd9e854 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -3,14 +3,20 @@ $graph: - name: hostSchema type: record fields: + host_species: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 host_id: string - host_species: string host_common_name: string? host_sex: string? host_age: int? host_age_unit: string? host_health_status: string? - host_treatment: string? + host_treatment: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 additional_host_information: string? - name: sampleSchema @@ -20,7 +26,10 @@ $graph: collecting_institution: string specimen_source: string? collection_date: string? - collection_location: string? + collection_location: + type: string? + jsonldPredicate: + _id: https://schema.org/fromLocation sample_storage_conditions: string? additional_collection_information: string? @@ -33,9 +42,18 @@ $graph: - name: technologySchema type: record fields: - sample_sequencing_technology: string - sequence_assembly_method: string? 
- sequencing_coverage: string? + sample_sequencing_technology: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + sequence_assembly_method: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0002699 + sequencing_coverage: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 - name: submitterSchema type: record -- cgit v1.2.3 From 03e857c1a477b04db11cf610760b1f2db7b859c5 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 12:43:42 -0700 Subject: Add auto-generated fillable metadata form --- __pycache__/main.cpython-36.pyc | Bin 2716 -> 6764 bytes main.py | 191 +++++++++++++++++++++++++++++++++++++--- pages/index.html | 29 ------ templates/form.html | 95 ++++++++++++++++++++ 4 files changed, 272 insertions(+), 43 deletions(-) delete mode 100644 pages/index.html create mode 100644 templates/form.html diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc index 250c562..0f929ad 100644 Binary files a/__pycache__/main.cpython-36.pyc and b/__pycache__/main.cpython-36.pyc differ diff --git a/main.py b/main.py index 630669c..d0f2793 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,12 @@ +import collections import tempfile import shutil import subprocess import os +import re +import string +import yaml +import urllib.request from flask import Flask, request, redirect, send_file, send_from_directory, render_template app = Flask(__name__, static_url_path='/static', static_folder='static') @@ -16,12 +21,118 @@ def handle_large_file(e): return (render_template('error.html', error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + +def type_to_heading(type_name): + """ + Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. + """ + + # Remove camel case + decamel = re.sub('([A-Z])', r' \1', type_name) + # Split + parts = decamel.split() + # Capitalize words and remove unwanted components + filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] + # Reassemble + return ' '.join(filtered) + +def name_to_label(field_name): + """ + Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. + """ + + return string.capwords(field_name.replace('_', ' ')) + +def generate_form(schema): + """ + Linearize the schema and send a bunch of dicts. + Each dict either has a 'heading' (in which case we put a heading for a + form section in the template) or an 'id', 'label', 'type', and 'required' + (in which case we make a form field in the template). + """ + + # Get the list of form components, one of which is the root + components = schema.get('$graph', []) + + # Find the root + root_name = None + # And also index components by type name + by_name = {} + for component in components: + # Get the name of each + component_name = component.get('name', None) + if isinstance(component_name, str): + # And remember how to map back form it + by_name[component_name] = component + if component.get('documentRoot', False): + # Find whichever one is the root + root_name = component_name + + + def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): + """ + Do a traversal of the component tree. + Yield a bunch of form item dicts, in order. + Form IDs are .-separated keypaths for where they are in the structure. + parent_keys is the path of field names to where we are in the root record's document tree. 
+ """ + + if len(parent_keys) > 1: + # First make a heading, if we aren't the very root of the form + yield {'heading': type_to_heading(type_name)} + + for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): + # For each field + + ref_url = None + if not isinstance(field_type, str): + # If the type isn't a string + # See if it has a more info/what goes here URL + ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) + # Grab out its type field + field_type = field_type.get('type', '') + + # Decide if the field is optional (type ends in ?) + optional = False + if len(field_type) > 0 and field_type[-1] == '?': + # It's optional + optional = True + # Drop the ? + field_type = field_type[:-1] + + if field_type in by_name: + # This is a subrecord. We need to recurse + for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): + yield item + else: + # We know how to make a string input + record = {} + record['id'] = '.'.join(parent_keys + [field_name]) + record['label'] = name_to_label(field_name) + record['required'] = not optional and not subtree_optional + if ref_url: + record['ref_url'] = ref_url + if field_type == 'string': + record['type'] = 'text' # HTML input type + elif field_type == 'int': + record['type'] = 'number' + else: + raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) + yield record + + return list(walk_fields(root_name)) + +# At startup, we need to load the current metadata schema so we can make a form for it +METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) +FORM_ITEMS = generate_form(METADATA_SCHEMA) + @app.route('/') def send_form(): """ Send the file upload form/front page. """ - return send_from_directory('pages', 'index.html') + + return render_template('form.html', fields=FORM_ITEMS) class FileTooBigError(RuntimeError): """ @@ -46,6 +157,20 @@ def copy_with_limit(in_file, out_file, limit=1024*1024): buf = in_file.read(buf_size) bytes_used += len(buf) +def parse_input(input_string, html_type): + """ + Parse an input from the given HTML input type into a useful Python type. + + Raise ValueError if something does not parse. + Raise NotImplementedError if we forgot to implement a type. 
+ """ + + if html_type == 'text': + return input_string + elif html_type == 'number': + return int(input_string) + else: + raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) @app.route('/submit', methods=['POST']) def recieve_files(): @@ -55,30 +180,68 @@ def recieve_files(): # We're going to work in one directory per request dest_dir = tempfile.mkdtemp() + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') try: - - print(request) - print(request.files) - if 'fasta' not in request.files: return (render_template('error.html', error_message="You did not include a FASTA file."), 403) - if 'metadata' not in request.files: - return (render_template('error.html', - error_message="You did not include a metadata file."), 403) - - fasta_dest = os.path.join(dest_dir, 'fasta.fa') - metadata_dest = os.path.join(dest_dir, 'metadata.json') - try: with open(fasta_dest, 'wb') as out_stream: copy_with_limit(request.files.get('fasta').stream, out_stream) - with open(metadata_dest, 'wb') as out_stream: - copy_with_limit(request.files.get('metadata').stream, out_stream) except FileTooBigError as e: # Delegate to the 413 error handler return handle_large_file(e) + if request.form.get('metadata_type', None) == 'upload': + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + try: + with open(metadata_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + elif request.form.get('metadata_type', None) == 'fill': + # Build a metadata dict + metadata = {} + + for item in FORM_ITEMS: + # Pull all the field values we wanted from the form + if 'heading' in item: + continue + + if item['id'] in request.form and len(request.form[item['id']]) > 0: + # We have this thing. Make a place in the dict tree for it. + parts = item['id'].split('.') + key = parts[-1] + # Remove leading 'metadata' + path = parts[1:-1] + dest_dict = metadata + for parent in path: + if parent not in dest_dict: + dest_dict[parent] = {} + dest_dict = dest_dict[parent] + + try: + # Now finally add the item + dest_dict[key] = parse_input(request.form[item['id']], item['type']) + except ValueError: + # We don't like that input + return (render_template('error.html', + error_message="You provided an unacceptable value for the metadata item {}".format(item['id'])), 403) + elif item['required']: + return (render_template('error.html', + error_message="You omitted the required metadata item {}".format(item['id'])), 403) + + # Now serialize the file with all the items + with open(metadata_dest, 'w') as out_stream: + yaml.dump(metadata, out_stream) + else: + return (render_template('error.html', + error_message="You did not include metadata."), 403) + # Try and upload files to Arvados result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/pages/index.html b/pages/index.html deleted file mode 100644 index 543ab7d..0000000 --- a/pages/index.html +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - Simple Web Uploader for Public SARS-CoV-2 Sequence Resource - - -

-[body of pages/index.html: markup lost in extraction. Recoverable text: page title and heading "Simple Web Uploader for Public SARS-CoV-2 Sequence Resource"; intro "This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the Public SARS-CoV-2 Sequence Resource. Your uploaded sequence will automatically be processed and incorporated into the public pangenome."; a form with FASTA and metadata file inputs and an Upload button; footer "Source · Made for COVID-19-BH20"]
diff --git a/templates/form.html b/templates/form.html
new file mode 100644
index 0000000..ec54de5
--- /dev/null
+++ b/templates/form.html
@@ -0,0 +1,95 @@
+[body of new templates/form.html: markup lost in extraction. Recoverable text: same title, heading, and intro as the old static page; radio buttons for choosing between uploading a metadata file and filling in the metadata form; a FASTA file input; a Jinja2 loop over the generated form items ("{% for record in fields %} {% if 'heading' in record %} ... {{ record['heading'] }} ... {% else %} ... {% endif %} {% endfor %}") that renders either a section heading or a labeled input per record; an Upload button; footer "Source · Made for COVID-19-BH20"]
-- cgit v1.2.3


From 51b5686f1df140628f1b39ecf40b45fbc0d0a59a Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Thu, 9 Apr 2020 12:45:10 -0700
Subject: Don't include pyc

---
 __pycache__/main.cpython-36.pyc | Bin 6764 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 __pycache__/main.cpython-36.pyc

diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc
deleted file mode 100644
index 0f929ad..0000000
Binary files a/__pycache__/main.cpython-36.pyc and /dev/null differ
-- cgit v1.2.3


From 062230b12bb71c4b906318f1de3d67c0fd26f3ba Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Thu, 9 Apr 2020 12:57:49 -0700
Subject: Make schema link nicer and add example files

---
 templates/form.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/form.html b/templates/form.html
index ec54de5..4ad41e2 100644
--- a/templates/form.html
+++ b/templates/form.html
@@ -29,7 +29,7 @@
-[metadata schema link line: markup lost in extraction]
+[metadata schema link line, reworded and now also linking the example files: markup lost in extraction]
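Taken together, the patches above replace the static upload page with a schema-driven form. The following is a condensed sketch (not from any patch in this series) of how the walk_fields() traversal linearizes a nested metadata schema into flat form items; the toy schema and field names are invented for illustration, and the real code additionally records ref_url links from jsonldPredicate and raises NotImplementedError for field types other than string and int.

```python
import re
import string

def type_to_heading(type_name):
    # Split camel case, drop 'Schema' parts, and capitalize, as in the patch
    parts = re.sub('([A-Z])', r' \1', type_name).split()
    return ' '.join(p.capitalize() for p in parts if p.lower() != 'schema')

def name_to_label(field_name):
    return string.capwords(field_name.replace('_', ' '))

def generate_form(schema):
    # Index record types by name and find the document root
    by_name = {c['name']: c for c in schema['$graph'] if isinstance(c.get('name'), str)}
    root_name = next(c['name'] for c in schema['$graph'] if c.get('documentRoot'))

    def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False):
        if len(parent_keys) > 1:
            # Every subrecord gets its own section heading
            yield {'heading': type_to_heading(type_name)}
        for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items():
            optional = field_type.endswith('?')  # a trailing ? marks the field optional
            field_type = field_type.rstrip('?')
            if field_type in by_name:
                # Subrecord: recurse, remembering the dotted keypath so far
                yield from walk_fields(field_type, parent_keys + [field_name],
                                       subtree_optional or optional)
            else:
                yield {'id': '.'.join(parent_keys + [field_name]),
                       'label': name_to_label(field_name),
                       'required': not optional and not subtree_optional,
                       # real code: 'text' for string, 'number' for int, error otherwise
                       'type': 'text' if field_type == 'string' else 'number'}

    return list(walk_fields(root_name))

toy_schema = {'$graph': [
    {'name': 'hostSchema', 'fields': {'host_species': 'string', 'host_age': 'int?'}},
    {'name': 'mainSchema', 'documentRoot': True, 'fields': {'host': 'hostSchema'}},
]}

print(generate_form(toy_schema))
# [{'heading': 'Host'},
#  {'id': 'metadata.host.host_species', 'label': 'Host Species', 'required': True, 'type': 'text'},
#  {'id': 'metadata.host.host_age', 'label': 'Host Age', 'required': False, 'type': 'number'}]
```

Flattening the schema this way is what lets form.html stay a single {% for record in fields %} loop instead of mirroring the schema's nesting.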

-- cgit v1.2.3 From dbe094a150d6c969b3d69f112b3538e6a87a74a2 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 9 Apr 2020 15:59:46 -0400 Subject: Add "sequencefile" for the metadata subject. --- bh20sequploader/bh20seq-schema.yml | 13 ++++++++++++- example/metadata.yaml | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index fd9e854..5c962d1 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,3 +1,8 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ $graph: - name: hostSchema @@ -75,4 +80,10 @@ $graph: sample: sampleSchema virus: virusSchema? technology: technologySchema - submitter: submitterSchema \ No newline at end of file + submitter: submitterSchema + sequencefile: + doc: The subject (eg the fasta/fastq file) that this metadata describes + type: string? + jsonldPredicate: + _id: "@id" + _type: "@id" diff --git a/example/metadata.yaml b/example/metadata.yaml index 8a93379..41ff93e 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -35,4 +35,4 @@ submitter: provider_sample_id: string submitter_sample_id: string authors: testAuthor - submitter_id: X12 \ No newline at end of file + submitter_id: X12 -- cgit v1.2.3 From b71cbe74aca99426872447b6dd343a962fe0a528 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 9 Apr 2020 16:25:34 -0500 Subject: Spacing and typo --- main.py | 70 +++++++++++++++++++++++++++++++---------------------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/main.py b/main.py index d0f2793..0d9b37a 100644 --- a/main.py +++ b/main.py @@ -26,7 +26,7 @@ def type_to_heading(type_name): """ Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. """ - + # Remove camel case decamel = re.sub('([A-Z])', r' \1', type_name) # Split @@ -35,12 +35,12 @@ def type_to_heading(type_name): filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] # Reassemble return ' '.join(filtered) - + def name_to_label(field_name): """ Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. """ - + return string.capwords(field_name.replace('_', ' ')) def generate_form(schema): @@ -50,10 +50,10 @@ def generate_form(schema): form section in the template) or an 'id', 'label', 'type', and 'required' (in which case we make a form field in the template). """ - + # Get the list of form components, one of which is the root components = schema.get('$graph', []) - + # Find the root root_name = None # And also index components by type name @@ -67,8 +67,8 @@ def generate_form(schema): if component.get('documentRoot', False): # Find whichever one is the root root_name = component_name - - + + def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): """ Do a traversal of the component tree. @@ -76,14 +76,14 @@ def generate_form(schema): Form IDs are .-separated keypaths for where they are in the structure. parent_keys is the path of field names to where we are in the root record's document tree. 
""" - + if len(parent_keys) > 1: # First make a heading, if we aren't the very root of the form yield {'heading': type_to_heading(type_name)} - + for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): # For each field - + ref_url = None if not isinstance(field_type, str): # If the type isn't a string @@ -91,7 +91,7 @@ def generate_form(schema): ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) # Grab out its type field field_type = field_type.get('type', '') - + # Decide if the field is optional (type ends in ?) optional = False if len(field_type) > 0 and field_type[-1] == '?': @@ -99,7 +99,7 @@ def generate_form(schema): optional = True # Drop the ? field_type = field_type[:-1] - + if field_type in by_name: # This is a subrecord. We need to recurse for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): @@ -119,9 +119,9 @@ def generate_form(schema): else: raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) yield record - + return list(walk_fields(root_name)) - + # At startup, we need to load the current metadata schema so we can make a form for it METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) FORM_ITEMS = generate_form(METADATA_SCHEMA) @@ -131,23 +131,23 @@ def send_form(): """ Send the file upload form/front page. """ - + return render_template('form.html', fields=FORM_ITEMS) - + class FileTooBigError(RuntimeError): """ Raised when the user gives a file that is too large. """ pass - + def copy_with_limit(in_file, out_file, limit=1024*1024): """ Copy a file stream, and raise FileTooBigError if the file is too big. """ - + bytes_used = 0 buf_size = 65536 - + buf = in_file.read(buf_size) bytes_used += len(buf) while buf: @@ -156,28 +156,28 @@ def copy_with_limit(in_file, out_file, limit=1024*1024): out_file.write(buf) buf = in_file.read(buf_size) bytes_used += len(buf) - + def parse_input(input_string, html_type): """ Parse an input from the given HTML input type into a useful Python type. - + Raise ValueError if something does not parse. Raise NotImplementedError if we forgot to implement a type. """ - + if html_type == 'text': return input_string elif html_type == 'number': return int(input_string) else: raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) - + @app.route('/submit', methods=['POST']) -def recieve_files(): +def receive_files(): """ - Recieve the uploaded files. + Receive the uploaded files. """ - + # We're going to work in one directory per request dest_dir = tempfile.mkdtemp() fasta_dest = os.path.join(dest_dir, 'fasta.fa') @@ -192,7 +192,7 @@ def recieve_files(): except FileTooBigError as e: # Delegate to the 413 error handler return handle_large_file(e) - + if request.form.get('metadata_type', None) == 'upload': if 'metadata' not in request.files: return (render_template('error.html', @@ -206,12 +206,12 @@ def recieve_files(): elif request.form.get('metadata_type', None) == 'fill': # Build a metadata dict metadata = {} - + for item in FORM_ITEMS: # Pull all the field values we wanted from the form if 'heading' in item: continue - + if item['id'] in request.form and len(request.form[item['id']]) > 0: # We have this thing. Make a place in the dict tree for it. 
parts = item['id'].split('.') @@ -223,7 +223,7 @@ def recieve_files(): if parent not in dest_dict: dest_dict[parent] = {} dest_dict = dest_dict[parent] - + try: # Now finally add the item dest_dict[key] = parse_input(request.form[item['id']], item['type']) @@ -234,18 +234,18 @@ def recieve_files(): elif item['required']: return (render_template('error.html', error_message="You omitted the required metadata item {}".format(item['id'])), 403) - + # Now serialize the file with all the items with open(metadata_dest, 'w') as out_stream: yaml.dump(metadata, out_stream) else: return (render_template('error.html', error_message="You did not include metadata."), 403) - + # Try and upload files to Arvados result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - + if result.returncode != 0: # It didn't work. Complain. error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) @@ -255,7 +255,3 @@ def recieve_files(): return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) finally: shutil.rmtree(dest_dir) - - - - -- cgit v1.2.3 From e110b52bae5a4a62ccc53970a7f26899e9debe7e Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 9 Apr 2020 17:48:15 -0400 Subject: Propagate metadata to pangenome so it can be merged by workflow --- bh20seqanalyzer/main.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 1a8965b..2030c1e 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -110,13 +110,20 @@ def start_pangenome_analysis(api, validated_project): validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) inputobj = { - "inputReads": [] + "inputReads": [], + "metadata": [], + "subjects": [] } for v in validated: inputobj["inputReads"].append({ "class": "File", "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] }) + inputobj["metadata"].append({ + "class": "File", + "location": "keep:%s/metadata.yaml" % v["portable_data_hash"] + }) + inputobj["subjects"].append("keep:%s/sequence.fasta" % v["portable_data_hash"]) run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj) -- cgit v1.2.3 From bf93a6a2fec690eee4bff4891469cd5947102b3a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 9 Apr 2020 17:02:38 -0500 Subject: Moved Guix documentation into separate file (as it confused people ;) --- README.md | 21 +++++---------------- bh20sequploader/main.py | 2 +- doc/INSTALL.md | 31 +++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 doc/INSTALL.md diff --git a/README.md b/README.md index a6fe052..3a8e5f0 100644 --- a/README.md +++ b/README.md @@ -122,19 +122,7 @@ It should print some instructions about how to use the uploader. ## Installation with GNU Guix -Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. - -1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: - -```sh -~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs -``` - -2. 
**Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief:
-
-```sh
-pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master
-```
+For running or developing the uploader with GNU Guix, see [INSTALL.md](./doc/INSTALL.md)
 
 # Usage
 
@@ -148,7 +136,7 @@ bh20-seq-uploader example/sequence.fasta example/metadata.json
 
 All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself.
 
-Get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run:
+For example, get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run this series of commands:
 
 ```sh
 minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf
 seqwish -s seqs.fa -p seqs.paf -g seqs.gfa
 odgi build -g seqs.gfa -s -o seqs.odgi
 odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5
 ```
 
-For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes).
-
+This pipeline has been converted to the Common Workflow Language (CWL); the
+sources can be found [here](https://github.com/hpobio-lab/viral-analysis/tree/master/cwl/pangenome-generate).
+For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes).
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 56cbe22..bf74ea5 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -6,7 +6,7 @@ import json
 import urllib.request
 import socket
 import getpass
-from .qc_metadata import qc_metadata
+import qc_metadata
 
 ARVADOS_API_HOST='lugli.arvadosapi.com'
 ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
diff --git a/doc/INSTALL.md b/doc/INSTALL.md
new file mode 100644
index 0000000..c5c486c
--- /dev/null
+++ b/doc/INSTALL.md
@@ -0,0 +1,31 @@
+# INSTALLATION
+
+Other options for running this tool.
+
+## GNU Guix
+
+Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system.
+
+1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run:
+
+```sh
+~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs
+```
+
+2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief:
+
+```sh
+pip3 install --user schema-salad arvados-python-client
+```
+
+Pip will install the following modules:
+
+```
+arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-client-1.6.7 httplib2-0.17.1 oauth2client-4.1.3 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.0 ruamel.yaml-0.15.77 six-1.14.0 uritemplate-3.0.1 ws4py-0.5.1
+```
+
+3.
Run the tool directly with + +```sh +~/opt/guix/bin/guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- python3 bh20sequploader/main.py +``` -- cgit v1.2.3 From 02615e46e56376302ef99f7223f447a070248214 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 9 Apr 2020 17:11:25 -0500 Subject: Notes --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 0d9b37a..b4e8681 100644 --- a/main.py +++ b/main.py @@ -242,7 +242,7 @@ def receive_files(): return (render_template('error.html', error_message="You did not include metadata."), 403) - # Try and upload files to Arvados + # Try and upload files to Arvados using the sequence uploader CLI result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], stdout=subprocess.PIPE, stderr=subprocess.PIPE) -- cgit v1.2.3 From 7d26be925f37b1f98cac23b018dd1a72fa506a3f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 15:41:29 -0700 Subject: Put back in directory --- bh20simplewebuploader/main.py | 257 +++++++++++++++++++++++++++ bh20simplewebuploader/templates/error.html | 19 ++ bh20simplewebuploader/templates/form.html | 95 ++++++++++ bh20simplewebuploader/templates/success.html | 24 +++ main.py | 257 --------------------------- templates/error.html | 19 -- templates/form.html | 95 ---------- templates/success.html | 24 --- 8 files changed, 395 insertions(+), 395 deletions(-) create mode 100644 bh20simplewebuploader/main.py create mode 100644 bh20simplewebuploader/templates/error.html create mode 100644 bh20simplewebuploader/templates/form.html create mode 100644 bh20simplewebuploader/templates/success.html delete mode 100644 main.py delete mode 100644 templates/error.html delete mode 100644 templates/form.html delete mode 100644 templates/success.html diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py new file mode 100644 index 0000000..b4e8681 --- /dev/null +++ b/bh20simplewebuploader/main.py @@ -0,0 +1,257 @@ +import collections +import tempfile +import shutil +import subprocess +import os +import re +import string +import yaml +import urllib.request +from flask import Flask, request, redirect, send_file, send_from_directory, render_template + +app = Flask(__name__, static_url_path='/static', static_folder='static') + +# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. +# We will enforce the limit ourselves and set a higher safety limit here. +app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 + +# When a file is too big we get a 413. +@app.errorhandler(413) +def handle_large_file(e): + return (render_template('error.html', + error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + + +def type_to_heading(type_name): + """ + Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. + """ + + # Remove camel case + decamel = re.sub('([A-Z])', r' \1', type_name) + # Split + parts = decamel.split() + # Capitalize words and remove unwanted components + filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] + # Reassemble + return ' '.join(filtered) + +def name_to_label(field_name): + """ + Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. + """ + + return string.capwords(field_name.replace('_', ' ')) + +def generate_form(schema): + """ + Linearize the schema and send a bunch of dicts. 
+ Each dict either has a 'heading' (in which case we put a heading for a + form section in the template) or an 'id', 'label', 'type', and 'required' + (in which case we make a form field in the template). + """ + + # Get the list of form components, one of which is the root + components = schema.get('$graph', []) + + # Find the root + root_name = None + # And also index components by type name + by_name = {} + for component in components: + # Get the name of each + component_name = component.get('name', None) + if isinstance(component_name, str): + # And remember how to map back form it + by_name[component_name] = component + if component.get('documentRoot', False): + # Find whichever one is the root + root_name = component_name + + + def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): + """ + Do a traversal of the component tree. + Yield a bunch of form item dicts, in order. + Form IDs are .-separated keypaths for where they are in the structure. + parent_keys is the path of field names to where we are in the root record's document tree. + """ + + if len(parent_keys) > 1: + # First make a heading, if we aren't the very root of the form + yield {'heading': type_to_heading(type_name)} + + for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): + # For each field + + ref_url = None + if not isinstance(field_type, str): + # If the type isn't a string + # See if it has a more info/what goes here URL + ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) + # Grab out its type field + field_type = field_type.get('type', '') + + # Decide if the field is optional (type ends in ?) + optional = False + if len(field_type) > 0 and field_type[-1] == '?': + # It's optional + optional = True + # Drop the ? + field_type = field_type[:-1] + + if field_type in by_name: + # This is a subrecord. We need to recurse + for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): + yield item + else: + # We know how to make a string input + record = {} + record['id'] = '.'.join(parent_keys + [field_name]) + record['label'] = name_to_label(field_name) + record['required'] = not optional and not subtree_optional + if ref_url: + record['ref_url'] = ref_url + if field_type == 'string': + record['type'] = 'text' # HTML input type + elif field_type == 'int': + record['type'] = 'number' + else: + raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) + yield record + + return list(walk_fields(root_name)) + +# At startup, we need to load the current metadata schema so we can make a form for it +METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) +FORM_ITEMS = generate_form(METADATA_SCHEMA) + +@app.route('/') +def send_form(): + """ + Send the file upload form/front page. + """ + + return render_template('form.html', fields=FORM_ITEMS) + +class FileTooBigError(RuntimeError): + """ + Raised when the user gives a file that is too large. + """ + pass + +def copy_with_limit(in_file, out_file, limit=1024*1024): + """ + Copy a file stream, and raise FileTooBigError if the file is too big. 
+ """ + + bytes_used = 0 + buf_size = 65536 + + buf = in_file.read(buf_size) + bytes_used += len(buf) + while buf: + if bytes_used > limit: + raise FileTooBigError('Hit file length limit') + out_file.write(buf) + buf = in_file.read(buf_size) + bytes_used += len(buf) + +def parse_input(input_string, html_type): + """ + Parse an input from the given HTML input type into a useful Python type. + + Raise ValueError if something does not parse. + Raise NotImplementedError if we forgot to implement a type. + """ + + if html_type == 'text': + return input_string + elif html_type == 'number': + return int(input_string) + else: + raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) + +@app.route('/submit', methods=['POST']) +def receive_files(): + """ + Receive the uploaded files. + """ + + # We're going to work in one directory per request + dest_dir = tempfile.mkdtemp() + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') + try: + if 'fasta' not in request.files: + return (render_template('error.html', + error_message="You did not include a FASTA file."), 403) + try: + with open(fasta_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('fasta').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + + if request.form.get('metadata_type', None) == 'upload': + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + try: + with open(metadata_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + elif request.form.get('metadata_type', None) == 'fill': + # Build a metadata dict + metadata = {} + + for item in FORM_ITEMS: + # Pull all the field values we wanted from the form + if 'heading' in item: + continue + + if item['id'] in request.form and len(request.form[item['id']]) > 0: + # We have this thing. Make a place in the dict tree for it. + parts = item['id'].split('.') + key = parts[-1] + # Remove leading 'metadata' + path = parts[1:-1] + dest_dict = metadata + for parent in path: + if parent not in dest_dict: + dest_dict[parent] = {} + dest_dict = dest_dict[parent] + + try: + # Now finally add the item + dest_dict[key] = parse_input(request.form[item['id']], item['type']) + except ValueError: + # We don't like that input + return (render_template('error.html', + error_message="You provided an unacceptable value for the metadata item {}".format(item['id'])), 403) + elif item['required']: + return (render_template('error.html', + error_message="You omitted the required metadata item {}".format(item['id'])), 403) + + # Now serialize the file with all the items + with open(metadata_dest, 'w') as out_stream: + yaml.dump(metadata, out_stream) + else: + return (render_template('error.html', + error_message="You did not include metadata."), 403) + + # Try and upload files to Arvados using the sequence uploader CLI + result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode != 0: + # It didn't work. Complain. + error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) + return (render_template('error.html', error_message=error_message), 403) + else: + # It worked. Say so. 
+ return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) + finally: + shutil.rmtree(dest_dir) diff --git a/bh20simplewebuploader/templates/error.html b/bh20simplewebuploader/templates/error.html new file mode 100644 index 0000000..c2ab0a4 --- /dev/null +++ b/bh20simplewebuploader/templates/error.html @@ -0,0 +1,19 @@ + + + + + + Upload Failed + + +

+[body of bh20simplewebuploader/templates/error.html: markup lost in extraction. Recoverable text: title and heading "Upload Failed"; message "Your upload has failed. {{error_message}}"; a "Click here to try again." link back to the form]
diff --git a/bh20simplewebuploader/templates/form.html b/bh20simplewebuploader/templates/form.html
new file mode 100644
index 0000000..4ad41e2
--- /dev/null
+++ b/bh20simplewebuploader/templates/form.html
@@ -0,0 +1,95 @@
+[body of bh20simplewebuploader/templates/form.html: identical to the templates/form.html created earlier, including the reworded schema link; markup lost in extraction]
diff --git a/bh20simplewebuploader/templates/success.html b/bh20simplewebuploader/templates/success.html
new file mode 100644
index 0000000..1be7861
--- /dev/null
+++ b/bh20simplewebuploader/templates/success.html
@@ -0,0 +1,24 @@

+[body of bh20simplewebuploader/templates/success.html: markup lost in extraction. Recoverable text: title and heading "Upload Successful"; message "Your files have been uploaded. They should soon appear as part of the Public SARS-CoV-2 Sequence Resource."; "The upload log was:" followed by a preformatted "{{log}}" block; a "Click here to upload more files." link]
diff --git a/main.py b/main.py
deleted file mode 100644
index b4e8681..0000000
--- a/main.py
+++ /dev/null
@@ -1,257 +0,0 @@
-[deleted main.py: 257 lines, identical (same blob b4e8681) to the new bh20simplewebuploader/main.py added above; this commit is a file move]
diff --git a/templates/error.html b/templates/error.html
deleted file mode 100644
index c2ab0a4..0000000
--- a/templates/error.html
+++ /dev/null
@@ -1,19 +0,0 @@

-[deleted templates/error.html: identical (blob c2ab0a4) to the error.html recreated under bh20simplewebuploader/templates/ above]
diff --git a/templates/form.html b/templates/form.html
deleted file mode 100644
index 4ad41e2..0000000
--- a/templates/form.html
+++ /dev/null
@@ -1,95 +0,0 @@
-[deleted templates/form.html: identical (blob 4ad41e2) to the form.html recreated under bh20simplewebuploader/templates/ above]
diff --git a/templates/success.html b/templates/success.html
deleted file mode 100644
index 1be7861..0000000
--- a/templates/success.html
+++ /dev/null
@@ -1,24 +0,0 @@
-[deleted templates/success.html: identical (blob 1be7861) to the success.html recreated under bh20simplewebuploader/templates/ above]
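The dotted IDs that generate_form() produces come back from the browser as flat POST keys, and the moved receive_files() handler folds them into a nested dict before serializing it as YAML. Below is a condensed sketch of that fold; the sample field names are invented, setdefault() stands in for the patch's explicit membership check, and the real code additionally walks FORM_ITEMS, parses each value with parse_input() according to its HTML type, and rejects missing required fields.

```python
form = {
    'metadata.host.host_species': 'Homo sapiens',
    'metadata.sample.collection_date': '2020-04-07',
}

metadata = {}
for dotted, value in form.items():
    parts = dotted.split('.')
    key, path = parts[-1], parts[1:-1]  # drop the leading 'metadata' component
    node = metadata
    for parent in path:
        node = node.setdefault(parent, {})  # create intermediate records on demand
    node[key] = value

assert metadata == {'host': {'host_species': 'Homo sapiens'},
                    'sample': {'collection_date': '2020-04-07'}}
```

yaml.dump() then writes this dict to metadata_dest, which is handed to the bh20-seq-uploader CLI alongside the FASTA file.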
-- cgit v1.2.3


From 2cd6623aa0ddfe4e42b2d434e0523773bb3536ef Mon Sep 17 00:00:00 2001
From: Adam Novak
Date: Thu, 9 Apr 2020 15:52:23 -0700
Subject: Copy over/combine top-level project components

---
 Dockerfile              | 19 +++++++++++++++++++
 README.md               | 45 +++++++++++++++++++++++++++++++++++++++++++++
 bh20sequploader/main.py |  7 ++-----
 setup.py                |  6 +++++-
 4 files changed, 71 insertions(+), 6 deletions(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..43fa8f2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,19 @@
+# Dockerfile for containerizing the web interface
+FROM python:3.6-jessie
+WORKDIR /app
+
+RUN pip3 install gunicorn
+
+ADD LICENSE /app/
+ADD gittaggers.py /app/
+ADD setup.py /app/
+ADD README.md /app/
+ADD example /app/example
+ADD bh20seqanalyzer /app/bh20seqanalyzer
+ADD bh20sequploader /app/bh20sequploader
+ADD bh20simplewebuploader /app/bh20simplewebuploader
+
+RUN pip3 install -e .
+
+ENV PORT 8080
+CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:8080", "bh20simplewebuploader.main:app"]
diff --git a/README.md b/README.md
index a6fe052..4667310 100644
--- a/README.md
+++ b/README.md
@@ -159,4 +159,49 @@ odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5
 
 For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes).
 
+# Web Interface
+
+This project comes with a simple web server that lets you use the sequence uploader from a browser. It will work as long as you install the package with the `web` extra.
+
+To run it locally:
+
+```
+virtualenv --python python3 venv
+. venv/bin/activate
+pip install -e .[web]
+env FLASK_APP=bh20simplewebuploader/main.py flask run
+```
+
+Then visit [http://127.0.0.1:5000/](http://127.0.0.1:5000/).
+ +## Production + +For production deployment, you can use [gunicorn](https://flask.palletsprojects.com/en/1.1.x/deploying/wsgi-standalone/#gunicorn): + +``` +pip3 install gunicorn +gunicorn bh20simplewebuploader.main:app +``` + +This runs on [http://127.0.0.1:8000/](http://127.0.0.1:8000/) by default, but can be adjusted with various [gunicorn options](http://docs.gunicorn.org/en/latest/run.html#commonly-used-arguments) + +## GNU Guix + +To run the web uploader in a GNU Guix environment + +``` +guix environment guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run +``` + +The containerized version looks like + +``` +guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl +``` + +and + +``` +env FLASK_APP=bh20simplewebuploader/main.py flask run +``` diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 8b8fefe..d3ebc0c 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,7 +6,6 @@ import json import urllib.request import socket import getpass -from .qc_metadata import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -20,8 +19,6 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - qc_metadata(args.metadata.name) - col = arvados.collection.Collection(api_client=api) print("Reading FASTA") @@ -32,8 +29,8 @@ def main(): f.write(r) r = args.sequence.read(65536) - print("Reading metadata") - with col.open("metadata.yaml", "w") as f: + print("Reading JSONLD") + with col.open("metadata.jsonld", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: diff --git a/setup.py b/setup.py index 48c25aa..41ace7b 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ except ImportError: tagger = egg_info_cmd.egg_info install_requires = ["arvados-python-client", "schema-salad"] +web_requires = ["flask", "pyyaml"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] @@ -29,9 +30,12 @@ setup( author="Peter Amstutz", author_email="peter.amstutz@curii.com", license="Apache 2.0", - packages=["bh20sequploader", "bh20seqanalyzer"], + packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"], package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, install_requires=install_requires, + extras_require={ + 'web': web_requires + }, setup_requires=[] + pytest_runner, tests_require=["pytest<5"], entry_points={ -- cgit v1.2.3 From 278ff0b42a49b861060eae0c7eb6112e9658fa4e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 16:08:23 -0700 Subject: Point to new repo for source --- bh20simplewebuploader/templates/form.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20simplewebuploader/templates/form.html b/bh20simplewebuploader/templates/form.html index 4ad41e2..2934a7c 100644 --- a/bh20simplewebuploader/templates/form.html +++ b/bh20simplewebuploader/templates/form.html @@ -60,7 +60,7 @@
- Source · Made for COVID-19-BH20 [source link target: the old repository; markup lost in extraction]
+ Source · Made for COVID-19-BH20 [source link target: the new arvados/bh20-seq-resource repository; markup lost in extraction]
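For reference, the input object assembled by the "Propagate metadata to pangenome" patch earlier in this series looks like the following once two collections have been validated. This is a sketch: the portable data hashes are made up, and in the running service they come from the Arvados collections API rather than a literal list.

```python
validated = [
    {"portable_data_hash": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa+100"},
    {"portable_data_hash": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb+200"},
]

inputobj = {"inputReads": [], "metadata": [], "subjects": []}
for v in validated:
    pdh = v["portable_data_hash"]
    # One FASTA and one metadata file per validated collection, plus the
    # subject URI the workflow uses to join each sequence to its metadata
    inputobj["inputReads"].append({"class": "File", "location": "keep:%s/sequence.fasta" % pdh})
    inputobj["metadata"].append({"class": "File", "location": "keep:%s/metadata.yaml" % pdh})
    inputobj["subjects"].append("keep:%s/sequence.fasta" % pdh)
```

run_workflow() then submits this object as the job input for the "Pangenome analysis" workflow run.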