From afbc3ec99f638a2f8df96a8e952b5b9616dc99a8 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 13:31:49 -0400 Subject: Now moves collections into 'validated sequences' project Improve logging for seq service Fix uploader bug Runs workflow with all validated sequences. --- bh20seqanalyzer/main.py | 83 +++++++++++++++++++++++++++++++++++++++---------- bh20sequploader/main.py | 3 +- setup.py | 2 +- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 23e58e9..dae8eca 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -1,29 +1,70 @@ import argparse import arvados +import arvados.collection import time import subprocess import tempfile import json +import logging + +logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) +logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) + +def validate_upload(api, collection, validated_project): + col = arvados.collection.Collection(collection["uuid"]) + + # validate the collection here. Check metadata, etc. + valid = True + + if "sequence.fasta" not in col: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) + if "metadata.jsonld" not in col: + logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + valid = False + + dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], + ["portable_data_hash", "=", col.portable_data_hash()]]).execute() + if dup["items"]: + # This exact collection has been uploaded before. + valid = False + logging.warn("Upload '%s' is duplicate" % collection["name"]) + + if valid: + logging.info("Added '%s' to validated sequences" % collection["name"]) + # Move it to the "validated" project to be included in the next analysis + api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + else: + # It is invalid, delete it. + logging.warn("Deleting '%s'" % collection["name"]) + api.collections().delete(uuid=collection["uuid"]).execute() + + return valid + +def start_analysis(api, + analysis_project, + workflow_uuid, + validated_project): -def start_analysis(api, collection, analysis_project, workflow_uuid): project = api.groups().create(body={ "group_class": "project", - "name": "Analysis of %s" % collection["name"], + "name": "Pangenome analysis", "owner_uuid": analysis_project, }, ensure_unique_name=True).execute() + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + with tempfile.NamedTemporaryFile() as tmp: - inputobj = json.dumps({ - "sequence": { + inputobj = { + "inputReads": [] + } + for v in validated: + inputobj["inputReads"].append({ "class": "File", - "location": "keep:%s/sequence.fasta" % collection["portable_data_hash"] - }, - "metadata": { - "class": "File", - "location": "keep:%s/metadata.jsonld" % collection["portable_data_hash"] - } - }, indent=2) - tmp.write(inputobj.encode('utf-8')) + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", "--submit", @@ -32,24 +73,32 @@ def start_analysis(api, collection, analysis_project, workflow_uuid): "--project-uuid=%s" % project["uuid"], "arvwf:%s" % workflow_uuid, tmp.name] - print("Running %s" % ' '.join(cmd)) + logging.info("Running %s" % ' '.join(cmd)) comp = subprocess.run(cmd, capture_output=True) if comp.returncode != 0: - print(comp.stderr.decode('utf-8')) - else: - api.collections().update(uuid=collection["uuid"], body={"owner_uuid": project['uuid']}).execute() + logging.error(comp.stderr.decode('utf-8')) + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') args = parser.parse_args() api = arvados.api() + logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) + while True: new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() + at_least_one_new_valid_seq = False for c in new_collections["items"]: - start_analysis(api, c, args.analysis_project, args.workflow_uuid) + at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + + if at_least_one_new_valid_seq: + start_analysis(api, args.analysis_project, + args.workflow_uuid, + args.validated_project) time.sleep(10) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 17ad492..d3ebc0c 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,4 +49,5 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) -main() +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 9e73ff0..0685d37 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import setuptools.command.egg_info as egg_info_cmd from setuptools import setup SETUP_DIR = os.path.dirname(__file__) -README = os.path.join(SETUP_DIR, "README.rst") +README = os.path.join(SETUP_DIR, "README.md") try: import gittaggers -- cgit v1.2.3 From 40df65dec296b81650987c8ee4f832b703ab8f74 Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 7 Apr 2020 19:51:49 +0200 Subject: adding dummy metadata qc to the project --- bh20sequploader/qc_metadata.py | 13 +++++++++++++ example/dummyschema.yaml | 16 ++++++++++++++++ example/metadata.json | 0 example/metadata.yaml | 17 +++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 bh20sequploader/qc_metadata.py create mode 100644 example/dummyschema.yaml delete mode 100644 example/metadata.json create mode 100644 example/metadata.yaml diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py new file mode 100644 index 0000000..0632777 --- /dev/null +++ b/bh20sequploader/qc_metadata.py @@ -0,0 +1,13 @@ +import yamale + +## NOTE: this is just a DUMMY. Everything about this can and will change +def qc_metadata(metadatafile): + print("Start metadata validation...") + schema = yamale.make_schema('../example/dummyschema.yaml') + data = yamale.make_data(metadatafile) + # Validate data against the schema. Throws a ValueError if data is invalid. + yamale.validate(schema, data) + print("...complete!") + +#qc_metadata("../example/metadata.yaml") + diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml new file mode 100644 index 0000000..e428324 --- /dev/null +++ b/example/dummyschema.yaml @@ -0,0 +1,16 @@ +#sampleInformation: include('sampleInformation') +#InstituteInformation: include('InstituteInformation') +--- +sampleInformation: + location : str() + host : str() + sequenceTechnology: str() + assemblyMethod: str() + +InstituteInformation: + OriginatingLab: str() + SubmittingLab: str() + +VirusDetail: + VirusName: str() + AccessionId: str() diff --git a/example/metadata.json b/example/metadata.json deleted file mode 100644 index e69de29..0000000 diff --git a/example/metadata.yaml b/example/metadata.yaml new file mode 100644 index 0000000..587d0be --- /dev/null +++ b/example/metadata.yaml @@ -0,0 +1,17 @@ +sampleInformation: + location: "USA" + host : "Homo Sapiens" + sequenceTechnology: "Sanger" + assemblyMethod: "CLC Genomics" + +InstituteInformation: + OriginatingLab: "Erik's kitchen" + SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" + +SubmitterInformation: + Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" + submissionDate: "04-04-2020" + +VirusDetail: + VirusName: "hCoV-19/USA/identifer/2020" + AccessionId: "EPI_ISL_Random" -- cgit v1.2.3 From 102a3663123292375440eeb04276d22a5b4645e0 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:27:10 -0400 Subject: Copy recent results to a set destination --- bh20seqanalyzer/main.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index dae8eca..2db97f6 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -11,7 +11,7 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, latest_result_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. @@ -79,12 +79,31 @@ def start_analysis(api, logging.error(comp.stderr.decode('utf-8')) +def copy_most_recent_result(api, analysis_project, latest_result_uuid): + most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], + order="created_at desc").execute() + for m in most_recent_analysis["items"]: + cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + wf = cr["items"][0] + src = api.collections().get(uuid=wf["output_uuid"]).execute() + dst = api.collections().get(uuid=latest_result_uuid).execute() + if src["portable_data_hash"] != dst["portable_data_hash"]: + logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) + api.collections().update(uuid=latest_result_uuid, + body={"manifest_text": src["manifest_text"], + "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + break + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -101,4 +120,7 @@ def main(): start_analysis(api, args.analysis_project, args.workflow_uuid, args.validated_project) + + copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + time.sleep(10) -- cgit v1.2.3 From 4215a82af730ff05b8fe98e226b759413cdf95f7 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:37:19 -0400 Subject: limit 1 --- bh20seqanalyzer/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2db97f6..2513ea3 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -81,7 +81,7 @@ def start_analysis(api, def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], - order="created_at desc").execute() + order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], ["requesting_container_uuid", "=", None]]).execute() -- cgit v1.2.3 From 37652786cb6605a4862e820f2ba85f2fe818952f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 7 Apr 2020 11:58:33 -0700 Subject: Make README more didactic --- README.md | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index ec9afb1..a6fe052 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,162 @@ # Sequence uploader -This repository provides a sequence uploader for the +This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. -# Run +To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage). -Run the uploader with a FASTA file and accompanying metadata: +# Installation - python3 bh20sequploader/main.py example/sequence.fasta example/metadata.json +There are several ways to install the uploader. The most portable is with a [virtualenv](#installation-with-virtualenv). -# Add a workflow +## Installation with `virtualenv` -get your SARS-CoV-2 sequences from GenBank in seqs.fa +1. **Prepare your system.** You need to make sure you have Python, and the ability to install modules such as `pycurl` and `pyopenssl`. On Ubuntu 18.04, you can run: ```sh -minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf -seqwish -s seqs.fa -p seqs.paf -g seqs.gfa -odgi build -g seqs.gfa -s -o seqs.odgi -odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev ``` -from https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes +2. **Create and enter your virtualenv.** Go to some memorable directory and make and enter a virtualenv: -# Installation +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +3. **Install the tool.** Once in your virtualenv, install this project: + +```sh +pip3 install git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +4. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +**Make sure you are in your virtualenv whenever you run the tool!** If you ever can't run the tool, and your prompt doesn't say `(venv)`, try going to the directory where you put the virtualenv and running `. venv/bin/activate`. It only works for the current terminal window; you will need to run it again if you open a new terminal. + +## Installation with `pip3 --user` + +If you don't want to have to enter a virtualenv every time you use the uploader, you can use the `--user` feature of `pip3` to install the tool for your user. + +1. **Prepare your system.** Just as for the `virtualenv` method, you need to install some dependencies. On Ubuntu 18.04, you can run: + +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Install the tool.** You can run: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +3. **Make sure the tool is on your `PATH`.** THe `pip3` command will install the uploader in `.local/bin` inside your home directory. Your shell may not know to look for commands there by default. To fix this for the terminal you currently have open, run: + +```sh +export PATH=$PATH:$HOME/.local/bin +``` + +To make this change permanent, assuming your shell is Bash, run: + +```sh +echo 'export PATH=$PATH:$HOME/.local/bin' >>~/.bashrc +``` + +4. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. -This tool requires the arvados Python module which can be installed -using .deb or .rpm packages through -https://doc.arvados.org/v2.0/sdk/python/sdk-python.html. The actual -code lives [here](https://github.com/arvados/arvados/tree/master/sdk/python) and -suggests a local install using +## Installation from Source for Development - apt-get install libcurl4-openssl-dev libssl1.0-dev - pip3 install --user arvados-python-client +If you plan to contribute to the project, you may want to install an editable copy from source. With this method, changes to the source code are automatically reflected in the installed copy of the tool. -Next update +1. **Prepare your system.** On Ubuntu 18.04, you can run: - export PATH=$PATH:$HOME/.local/bin +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Clone and enter the repository.** You can run: + +```sh +git clone https://github.com/arvados/bh20-seq-resource.git +cd bh20-seq-resource +``` + +3. **Create and enter a virtualenv.** Go to some memorable directory and make and enter a virtualenv: + +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +4. **Install the checked-out repository in editable mode.** Once in your virtualenv, install with this special pip command: + +```sh +pip3 install -e . +``` + +5. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +## Installation with GNU Guix -## Install with GNU Guix +Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. -Set up a container: +1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: + +```sh +~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs +``` + +2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +# Usage + +Run the uploader with a FASTA file and accompanying metadata file in [JSON-LD format](https://json-ld.org/): + +```sh +bh20-seq-uploader example/sequence.fasta example/metadata.json +``` + +## Workflow for Generating a Pangenome + +All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. + +Get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run: + +```sh +minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf +seqwish -s seqs.fa -p seqs.paf -g seqs.gfa +odgi build -g seqs.gfa -s -o seqs.odgi +odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +``` - ~/opt/guix/bin/guix environment -C guix --ad-hoc python openssl python-pycurl nss-certs - pip3 install --user arvados-python-client +For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). -Pip installed the following modules - arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-client-1.6.7 httplib2-0.17.1 oauth2client-4.1.3 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.0 ruamel.yaml-0.15.77 six-1.14.0 uritemplate-3.0.1 ws4py-0.5.1 -- cgit v1.2.3 From 07bc4c65535437b8e9e0744f08da8cea541d0116 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 15:28:42 -0400 Subject: Add metadata validation with schema-salad --- bh20seqanalyzer/main.py | 11 ++++++++--- bh20sequploader/bh20seq-schema.yml | 36 ++++++++++++++++++++++++++++++++++++ bh20sequploader/main.py | 7 +++++-- bh20sequploader/qc_metadata.py | 26 +++++++++++++++++--------- example/dummyschema.yaml | 16 ---------------- setup.py | 3 ++- 6 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 bh20sequploader/bh20seq-schema.yml delete mode 100644 example/dummyschema.yaml diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2513ea3..78e32c9 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -6,12 +6,14 @@ import subprocess import tempfile import json import logging +import ruamel.yaml +from bh20sequploader.qc_metadata import qc_metadata logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project, latest_result_uuid): +def validate_upload(api, collection, validated_project): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. @@ -20,9 +22,12 @@ def validate_upload(api, collection, validated_project, latest_result_uuid): if "sequence.fasta" not in col: valid = False logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) - if "metadata.jsonld" not in col: - logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + if "metadata.yaml" not in col: + logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False + else: + metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) + valid = qc_metadata(metadata_content) and valid dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml new file mode 100644 index 0000000..6e0973a --- /dev/null +++ b/bh20sequploader/bh20seq-schema.yml @@ -0,0 +1,36 @@ +$graph: + +- name: sampleInformationSchema + type: record + fields: + location: string + host: string + sequenceTechnology: string + assemblyMethod: string + +- name: InstituteInformationSchema + type: record + fields: + OriginatingLab: string + SubmittingLab: string + +- name: SubmitterInformationSchema + type: record + fields: + Submitter: string + submissionDate: string + +- name: VirusDetailSchema + type: record + fields: + VirusName: string + AccessionId: string + +- name: MainSchema + type: record + documentRoot: true + fields: + sampleInformation: sampleInformationSchema + InstituteInformation: InstituteInformationSchema + SubmitterInformation: SubmitterInformationSchema + VirusDetail: VirusDetailSchema diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index d3ebc0c..8b8fefe 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,6 +6,7 @@ import json import urllib.request import socket import getpass +from .qc_metadata import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -19,6 +20,8 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + qc_metadata(args.metadata.name) + col = arvados.collection.Collection(api_client=api) print("Reading FASTA") @@ -29,8 +32,8 @@ def main(): f.write(r) r = args.sequence.read(65536) - print("Reading JSONLD") - with col.open("metadata.jsonld", "w") as f: + print("Reading metadata") + with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 0632777..78b31b2 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,13 +1,21 @@ -import yamale +import schema_salad.schema +import logging +import pkg_resources -## NOTE: this is just a DUMMY. Everything about this can and will change def qc_metadata(metadatafile): - print("Start metadata validation...") - schema = yamale.make_schema('../example/dummyschema.yaml') - data = yamale.make_data(metadatafile) - # Validate data against the schema. Throws a ValueError if data is invalid. - yamale.validate(schema, data) - print("...complete!") + schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") + cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} + (document_loader, + avsc_names, + schema_metadata, + metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) -#qc_metadata("../example/metadata.yaml") + if not isinstance(avsc_names, schema_salad.avro.schema.Names): + print(avsc_names) + return False + try: + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) + return True + except: + return False diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml deleted file mode 100644 index e428324..0000000 --- a/example/dummyschema.yaml +++ /dev/null @@ -1,16 +0,0 @@ -#sampleInformation: include('sampleInformation') -#InstituteInformation: include('InstituteInformation') ---- -sampleInformation: - location : str() - host : str() - sequenceTechnology: str() - assemblyMethod: str() - -InstituteInformation: - OriginatingLab: str() - SubmittingLab: str() - -VirusDetail: - VirusName: str() - AccessionId: str() diff --git a/setup.py b/setup.py index 0685d37..48c25aa 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client"] +install_requires = ["arvados-python-client", "schema-salad"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] @@ -30,6 +30,7 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer"], + package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, install_requires=install_requires, setup_requires=[] + pytest_runner, tests_require=["pytest<5"], -- cgit v1.2.3 From 14ff178ed7f77a996f47e2115e2a1429f6b69356 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 12:12:49 -0700 Subject: Spell correctly --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6fe052..1448f4c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. -To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage). +To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [upload your data](#usage). # Installation -- cgit v1.2.3 From 414c308b8860d1b20481a2ec3b2f6381e4f6061b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 14:11:39 -0700 Subject: Initial commit of working frontend --- __pycache__/main.cpython-36.pyc | Bin 0 -> 2716 bytes main.py | 98 ++++++++++++++++++++++++++++++++++++++++ pages/index.html | 28 ++++++++++++ templates/error.html | 19 ++++++++ templates/success.html | 24 ++++++++++ 5 files changed, 169 insertions(+) create mode 100644 __pycache__/main.cpython-36.pyc create mode 100644 main.py create mode 100644 pages/index.html create mode 100644 templates/error.html create mode 100644 templates/success.html diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc new file mode 100644 index 0000000..250c562 Binary files /dev/null and b/__pycache__/main.cpython-36.pyc differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..630669c --- /dev/null +++ b/main.py @@ -0,0 +1,98 @@ +import tempfile +import shutil +import subprocess +import os +from flask import Flask, request, redirect, send_file, send_from_directory, render_template + +app = Flask(__name__, static_url_path='/static', static_folder='static') + +# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. +# We will enforce the limit ourselves and set a higher safety limit here. +app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 + +# When a file is too big we get a 413. +@app.errorhandler(413) +def handle_large_file(e): + return (render_template('error.html', + error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + +@app.route('/') +def send_form(): + """ + Send the file upload form/front page. + """ + return send_from_directory('pages', 'index.html') + +class FileTooBigError(RuntimeError): + """ + Raised when the user gives a file that is too large. + """ + pass + +def copy_with_limit(in_file, out_file, limit=1024*1024): + """ + Copy a file stream, and raise FileTooBigError if the file is too big. + """ + + bytes_used = 0 + buf_size = 65536 + + buf = in_file.read(buf_size) + bytes_used += len(buf) + while buf: + if bytes_used > limit: + raise FileTooBigError('Hit file length limit') + out_file.write(buf) + buf = in_file.read(buf_size) + bytes_used += len(buf) + + +@app.route('/submit', methods=['POST']) +def recieve_files(): + """ + Recieve the uploaded files. + """ + + # We're going to work in one directory per request + dest_dir = tempfile.mkdtemp() + try: + + print(request) + print(request.files) + + if 'fasta' not in request.files: + return (render_template('error.html', + error_message="You did not include a FASTA file."), 403) + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') + + try: + with open(fasta_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('fasta').stream, out_stream) + with open(metadata_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + + # Try and upload files to Arvados + result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode != 0: + # It didn't work. Complain. + error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) + return (render_template('error.html', error_message=error_message), 403) + else: + # It worked. Say so. + return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) + finally: + shutil.rmtree(dest_dir) + + + + diff --git a/pages/index.html b/pages/index.html new file mode 100644 index 0000000..2269791 --- /dev/null +++ b/pages/index.html @@ -0,0 +1,28 @@ + + +
+ + ++ This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the Public SARS-CoV-2 Sequence Resource. Your uploaded sequence will automatically be processed and incorporated into the public pangenome. +
++ Your upload has failed. {{error_message}} +
+ ++ Your files have been uploaded. They should soon appear as part of the Public SARS-CoV-2 Sequence Resource. +
++ The upload log was: +
+{{log}}+
+ Click here to upload more files. +
+