From afbc3ec99f638a2f8df96a8e952b5b9616dc99a8 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 13:31:49 -0400 Subject: Now moves collections into 'validated sequences' project Improve logging for seq service Fix uploader bug Runs workflow with all validated sequences. --- bh20seqanalyzer/main.py | 83 +++++++++++++++++++++++++++++++++++++++---------- bh20sequploader/main.py | 3 +- setup.py | 2 +- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 23e58e9..dae8eca 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -1,29 +1,70 @@ import argparse import arvados +import arvados.collection import time import subprocess import tempfile import json +import logging + +logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) +logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) + +def validate_upload(api, collection, validated_project): + col = arvados.collection.Collection(collection["uuid"]) + + # validate the collection here. Check metadata, etc. + valid = True + + if "sequence.fasta" not in col: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) + if "metadata.jsonld" not in col: + logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + valid = False + + dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], + ["portable_data_hash", "=", col.portable_data_hash()]]).execute() + if dup["items"]: + # This exact collection has been uploaded before. + valid = False + logging.warn("Upload '%s' is duplicate" % collection["name"]) + + if valid: + logging.info("Added '%s' to validated sequences" % collection["name"]) + # Move it to the "validated" project to be included in the next analysis + api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + else: + # It is invalid, delete it. + logging.warn("Deleting '%s'" % collection["name"]) + api.collections().delete(uuid=collection["uuid"]).execute() + + return valid + +def start_analysis(api, + analysis_project, + workflow_uuid, + validated_project): -def start_analysis(api, collection, analysis_project, workflow_uuid): project = api.groups().create(body={ "group_class": "project", - "name": "Analysis of %s" % collection["name"], + "name": "Pangenome analysis", "owner_uuid": analysis_project, }, ensure_unique_name=True).execute() + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + with tempfile.NamedTemporaryFile() as tmp: - inputobj = json.dumps({ - "sequence": { + inputobj = { + "inputReads": [] + } + for v in validated: + inputobj["inputReads"].append({ "class": "File", - "location": "keep:%s/sequence.fasta" % collection["portable_data_hash"] - }, - "metadata": { - "class": "File", - "location": "keep:%s/metadata.jsonld" % collection["portable_data_hash"] - } - }, indent=2) - tmp.write(inputobj.encode('utf-8')) + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", "--submit", @@ -32,24 +73,32 @@ def start_analysis(api, collection, analysis_project, workflow_uuid): "--project-uuid=%s" % project["uuid"], "arvwf:%s" % workflow_uuid, tmp.name] - print("Running %s" % ' '.join(cmd)) + logging.info("Running %s" % ' '.join(cmd)) comp = subprocess.run(cmd, capture_output=True) if comp.returncode != 0: - print(comp.stderr.decode('utf-8')) - else: - api.collections().update(uuid=collection["uuid"], body={"owner_uuid": project['uuid']}).execute() + logging.error(comp.stderr.decode('utf-8')) + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') args = parser.parse_args() api = arvados.api() + logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) + while True: new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() + at_least_one_new_valid_seq = False for c in new_collections["items"]: - start_analysis(api, c, args.analysis_project, args.workflow_uuid) + at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + + if at_least_one_new_valid_seq: + start_analysis(api, args.analysis_project, + args.workflow_uuid, + args.validated_project) time.sleep(10) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 17ad492..d3ebc0c 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,4 +49,5 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) -main() +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 9e73ff0..0685d37 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import setuptools.command.egg_info as egg_info_cmd from setuptools import setup SETUP_DIR = os.path.dirname(__file__) -README = os.path.join(SETUP_DIR, "README.rst") +README = os.path.join(SETUP_DIR, "README.md") try: import gittaggers -- cgit v1.2.3 From 40df65dec296b81650987c8ee4f832b703ab8f74 Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 7 Apr 2020 19:51:49 +0200 Subject: adding dummy metadata qc to the project --- bh20sequploader/qc_metadata.py | 13 +++++++++++++ example/dummyschema.yaml | 16 ++++++++++++++++ example/metadata.json | 0 example/metadata.yaml | 17 +++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 bh20sequploader/qc_metadata.py create mode 100644 example/dummyschema.yaml delete mode 100644 example/metadata.json create mode 100644 example/metadata.yaml diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py new file mode 100644 index 0000000..0632777 --- /dev/null +++ b/bh20sequploader/qc_metadata.py @@ -0,0 +1,13 @@ +import yamale + +## NOTE: this is just a DUMMY. Everything about this can and will change +def qc_metadata(metadatafile): + print("Start metadata validation...") + schema = yamale.make_schema('../example/dummyschema.yaml') + data = yamale.make_data(metadatafile) + # Validate data against the schema. Throws a ValueError if data is invalid. + yamale.validate(schema, data) + print("...complete!") + +#qc_metadata("../example/metadata.yaml") + diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml new file mode 100644 index 0000000..e428324 --- /dev/null +++ b/example/dummyschema.yaml @@ -0,0 +1,16 @@ +#sampleInformation: include('sampleInformation') +#InstituteInformation: include('InstituteInformation') +--- +sampleInformation: + location : str() + host : str() + sequenceTechnology: str() + assemblyMethod: str() + +InstituteInformation: + OriginatingLab: str() + SubmittingLab: str() + +VirusDetail: + VirusName: str() + AccessionId: str() diff --git a/example/metadata.json b/example/metadata.json deleted file mode 100644 index e69de29..0000000 diff --git a/example/metadata.yaml b/example/metadata.yaml new file mode 100644 index 0000000..587d0be --- /dev/null +++ b/example/metadata.yaml @@ -0,0 +1,17 @@ +sampleInformation: + location: "USA" + host : "Homo Sapiens" + sequenceTechnology: "Sanger" + assemblyMethod: "CLC Genomics" + +InstituteInformation: + OriginatingLab: "Erik's kitchen" + SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" + +SubmitterInformation: + Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" + submissionDate: "04-04-2020" + +VirusDetail: + VirusName: "hCoV-19/USA/identifer/2020" + AccessionId: "EPI_ISL_Random" -- cgit v1.2.3 From 102a3663123292375440eeb04276d22a5b4645e0 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:27:10 -0400 Subject: Copy recent results to a set destination --- bh20seqanalyzer/main.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index dae8eca..2db97f6 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -11,7 +11,7 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, latest_result_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. @@ -79,12 +79,31 @@ def start_analysis(api, logging.error(comp.stderr.decode('utf-8')) +def copy_most_recent_result(api, analysis_project, latest_result_uuid): + most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], + order="created_at desc").execute() + for m in most_recent_analysis["items"]: + cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + wf = cr["items"][0] + src = api.collections().get(uuid=wf["output_uuid"]).execute() + dst = api.collections().get(uuid=latest_result_uuid).execute() + if src["portable_data_hash"] != dst["portable_data_hash"]: + logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) + api.collections().update(uuid=latest_result_uuid, + body={"manifest_text": src["manifest_text"], + "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + break + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -101,4 +120,7 @@ def main(): start_analysis(api, args.analysis_project, args.workflow_uuid, args.validated_project) + + copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + time.sleep(10) -- cgit v1.2.3 From 4215a82af730ff05b8fe98e226b759413cdf95f7 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:37:19 -0400 Subject: limit 1 --- bh20seqanalyzer/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2db97f6..2513ea3 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -81,7 +81,7 @@ def start_analysis(api, def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], - order="created_at desc").execute() + order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], ["requesting_container_uuid", "=", None]]).execute() -- cgit v1.2.3 From 37652786cb6605a4862e820f2ba85f2fe818952f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 7 Apr 2020 11:58:33 -0700 Subject: Make README more didactic --- README.md | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index ec9afb1..a6fe052 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,162 @@ # Sequence uploader -This repository provides a sequence uploader for the +This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. -# Run +To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage). -Run the uploader with a FASTA file and accompanying metadata: +# Installation - python3 bh20sequploader/main.py example/sequence.fasta example/metadata.json +There are several ways to install the uploader. The most portable is with a [virtualenv](#installation-with-virtualenv). -# Add a workflow +## Installation with `virtualenv` -get your SARS-CoV-2 sequences from GenBank in seqs.fa +1. **Prepare your system.** You need to make sure you have Python, and the ability to install modules such as `pycurl` and `pyopenssl`. On Ubuntu 18.04, you can run: ```sh -minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf -seqwish -s seqs.fa -p seqs.paf -g seqs.gfa -odgi build -g seqs.gfa -s -o seqs.odgi -odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev ``` -from https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes +2. **Create and enter your virtualenv.** Go to some memorable directory and make and enter a virtualenv: -# Installation +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +3. **Install the tool.** Once in your virtualenv, install this project: + +```sh +pip3 install git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +4. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +**Make sure you are in your virtualenv whenever you run the tool!** If you ever can't run the tool, and your prompt doesn't say `(venv)`, try going to the directory where you put the virtualenv and running `. venv/bin/activate`. It only works for the current terminal window; you will need to run it again if you open a new terminal. + +## Installation with `pip3 --user` + +If you don't want to have to enter a virtualenv every time you use the uploader, you can use the `--user` feature of `pip3` to install the tool for your user. + +1. **Prepare your system.** Just as for the `virtualenv` method, you need to install some dependencies. On Ubuntu 18.04, you can run: + +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Install the tool.** You can run: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +3. **Make sure the tool is on your `PATH`.** THe `pip3` command will install the uploader in `.local/bin` inside your home directory. Your shell may not know to look for commands there by default. To fix this for the terminal you currently have open, run: + +```sh +export PATH=$PATH:$HOME/.local/bin +``` + +To make this change permanent, assuming your shell is Bash, run: + +```sh +echo 'export PATH=$PATH:$HOME/.local/bin' >>~/.bashrc +``` + +4. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. -This tool requires the arvados Python module which can be installed -using .deb or .rpm packages through -https://doc.arvados.org/v2.0/sdk/python/sdk-python.html. The actual -code lives [here](https://github.com/arvados/arvados/tree/master/sdk/python) and -suggests a local install using +## Installation from Source for Development - apt-get install libcurl4-openssl-dev libssl1.0-dev - pip3 install --user arvados-python-client +If you plan to contribute to the project, you may want to install an editable copy from source. With this method, changes to the source code are automatically reflected in the installed copy of the tool. -Next update +1. **Prepare your system.** On Ubuntu 18.04, you can run: - export PATH=$PATH:$HOME/.local/bin +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Clone and enter the repository.** You can run: + +```sh +git clone https://github.com/arvados/bh20-seq-resource.git +cd bh20-seq-resource +``` + +3. **Create and enter a virtualenv.** Go to some memorable directory and make and enter a virtualenv: + +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +4. **Install the checked-out repository in editable mode.** Once in your virtualenv, install with this special pip command: + +```sh +pip3 install -e . +``` + +5. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +## Installation with GNU Guix -## Install with GNU Guix +Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. -Set up a container: +1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: + +```sh +~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs +``` + +2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +# Usage + +Run the uploader with a FASTA file and accompanying metadata file in [JSON-LD format](https://json-ld.org/): + +```sh +bh20-seq-uploader example/sequence.fasta example/metadata.json +``` + +## Workflow for Generating a Pangenome + +All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. + +Get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run: + +```sh +minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf +seqwish -s seqs.fa -p seqs.paf -g seqs.gfa +odgi build -g seqs.gfa -s -o seqs.odgi +odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +``` - ~/opt/guix/bin/guix environment -C guix --ad-hoc python openssl python-pycurl nss-certs - pip3 install --user arvados-python-client +For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). -Pip installed the following modules - arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-client-1.6.7 httplib2-0.17.1 oauth2client-4.1.3 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.0 ruamel.yaml-0.15.77 six-1.14.0 uritemplate-3.0.1 ws4py-0.5.1 -- cgit v1.2.3 From 07bc4c65535437b8e9e0744f08da8cea541d0116 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 15:28:42 -0400 Subject: Add metadata validation with schema-salad --- bh20seqanalyzer/main.py | 11 ++++++++--- bh20sequploader/bh20seq-schema.yml | 36 ++++++++++++++++++++++++++++++++++++ bh20sequploader/main.py | 7 +++++-- bh20sequploader/qc_metadata.py | 26 +++++++++++++++++--------- example/dummyschema.yaml | 16 ---------------- setup.py | 3 ++- 6 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 bh20sequploader/bh20seq-schema.yml delete mode 100644 example/dummyschema.yaml diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2513ea3..78e32c9 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -6,12 +6,14 @@ import subprocess import tempfile import json import logging +import ruamel.yaml +from bh20sequploader.qc_metadata import qc_metadata logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project, latest_result_uuid): +def validate_upload(api, collection, validated_project): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. @@ -20,9 +22,12 @@ def validate_upload(api, collection, validated_project, latest_result_uuid): if "sequence.fasta" not in col: valid = False logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) - if "metadata.jsonld" not in col: - logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + if "metadata.yaml" not in col: + logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False + else: + metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) + valid = qc_metadata(metadata_content) and valid dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml new file mode 100644 index 0000000..6e0973a --- /dev/null +++ b/bh20sequploader/bh20seq-schema.yml @@ -0,0 +1,36 @@ +$graph: + +- name: sampleInformationSchema + type: record + fields: + location: string + host: string + sequenceTechnology: string + assemblyMethod: string + +- name: InstituteInformationSchema + type: record + fields: + OriginatingLab: string + SubmittingLab: string + +- name: SubmitterInformationSchema + type: record + fields: + Submitter: string + submissionDate: string + +- name: VirusDetailSchema + type: record + fields: + VirusName: string + AccessionId: string + +- name: MainSchema + type: record + documentRoot: true + fields: + sampleInformation: sampleInformationSchema + InstituteInformation: InstituteInformationSchema + SubmitterInformation: SubmitterInformationSchema + VirusDetail: VirusDetailSchema diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index d3ebc0c..8b8fefe 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,6 +6,7 @@ import json import urllib.request import socket import getpass +from .qc_metadata import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -19,6 +20,8 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + qc_metadata(args.metadata.name) + col = arvados.collection.Collection(api_client=api) print("Reading FASTA") @@ -29,8 +32,8 @@ def main(): f.write(r) r = args.sequence.read(65536) - print("Reading JSONLD") - with col.open("metadata.jsonld", "w") as f: + print("Reading metadata") + with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 0632777..78b31b2 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,13 +1,21 @@ -import yamale +import schema_salad.schema +import logging +import pkg_resources -## NOTE: this is just a DUMMY. Everything about this can and will change def qc_metadata(metadatafile): - print("Start metadata validation...") - schema = yamale.make_schema('../example/dummyschema.yaml') - data = yamale.make_data(metadatafile) - # Validate data against the schema. Throws a ValueError if data is invalid. - yamale.validate(schema, data) - print("...complete!") + schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") + cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} + (document_loader, + avsc_names, + schema_metadata, + metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) -#qc_metadata("../example/metadata.yaml") + if not isinstance(avsc_names, schema_salad.avro.schema.Names): + print(avsc_names) + return False + try: + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) + return True + except: + return False diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml deleted file mode 100644 index e428324..0000000 --- a/example/dummyschema.yaml +++ /dev/null @@ -1,16 +0,0 @@ -#sampleInformation: include('sampleInformation') -#InstituteInformation: include('InstituteInformation') ---- -sampleInformation: - location : str() - host : str() - sequenceTechnology: str() - assemblyMethod: str() - -InstituteInformation: - OriginatingLab: str() - SubmittingLab: str() - -VirusDetail: - VirusName: str() - AccessionId: str() diff --git a/setup.py b/setup.py index 0685d37..48c25aa 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client"] +install_requires = ["arvados-python-client", "schema-salad"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] @@ -30,6 +30,7 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer"], + package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, install_requires=install_requires, setup_requires=[] + pytest_runner, tests_require=["pytest<5"], -- cgit v1.2.3 From 9458ed33da08c787c4bb20af7b4108c93334b351 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 8 Apr 2020 17:41:19 -0400 Subject: Fastq now runs through fastq2fasta pipeline then gets added to pangenome analysis. --- bh20seqanalyzer/main.py | 141 ++++++++++++++++++++++++++++++----------- bh20sequploader/main.py | 14 +++- bh20sequploader/qc_metadata.py | 6 +- 3 files changed, 120 insertions(+), 41 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 78e32c9..1a8965b 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -13,21 +13,30 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, + fastq_project, fastq_workflow_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. valid = True - if "sequence.fasta" not in col: - valid = False - logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) if "metadata.yaml" not in col: logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False else: metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) - valid = qc_metadata(metadata_content) and valid + #valid = qc_metadata(metadata_content) and valid + if not valid: + logging.warn("Failed metadata qc") + + if valid: + if "sequence.fasta" not in col: + if "reads.fastq" in col: + start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) + return False + else: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() @@ -39,7 +48,9 @@ def validate_upload(api, collection, validated_project): if valid: logging.info("Added '%s' to validated sequences" % collection["name"]) # Move it to the "validated" project to be included in the next analysis - api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + api.collections().update(uuid=collection["uuid"], body={ + "owner_uuid": validated_project, + "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute() else: # It is invalid, delete it. logging.warn("Deleting '%s'" % collection["name"]) @@ -47,28 +58,15 @@ def validate_upload(api, collection, validated_project): return valid -def start_analysis(api, - analysis_project, - workflow_uuid, - validated_project): +def run_workflow(api, parent_project, workflow_uuid, name, inputobj): project = api.groups().create(body={ "group_class": "project", - "name": "Pangenome analysis", - "owner_uuid": analysis_project, + "name": name, + "owner_uuid": parent_project, }, ensure_unique_name=True).execute() - validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) - with tempfile.NamedTemporaryFile() as tmp: - inputobj = { - "inputReads": [] - } - for v in validated: - inputobj["inputReads"].append({ - "class": "File", - "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] - }) tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", @@ -83,32 +81,95 @@ def start_analysis(api, if comp.returncode != 0: logging.error(comp.stderr.decode('utf-8')) + return project + + +def start_fastq_to_fasta(api, collection, + analysis_project, + fastq_workflow_uuid): + newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", { + "fastq_forward": { + "class": "File", + "location": "keep:%s/reads.fastq" % collection["portable_data_hash"] + }, + "metadata": { + "class": "File", + "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"] + }, + "ref_fasta": { + "class": "File", + "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta" + } + }) + api.collections().update(uuid=collection["uuid"], + body={"owner_uuid": newproject["uuid"]}).execute() + +def start_pangenome_analysis(api, + analysis_project, + pangenome_workflow_uuid, + validated_project): + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + inputobj = { + "inputReads": [] + } + for v in validated: + inputobj["inputReads"].append({ + "class": "File", + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj) + + +def get_workflow_output_from_project(api, uuid): + cr = api.container_requests().list(filters=[['owner_uuid', '=', uuid], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + return cr["items"][0] + else: + return None + def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: - cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], - ["requesting_container_uuid", "=", None]]).execute() - if cr["items"] and cr["items"][0]["output_uuid"]: - wf = cr["items"][0] + wf = get_workflow_output_from_project(api, m["uuid"]) + if wf: src = api.collections().get(uuid=wf["output_uuid"]).execute() dst = api.collections().get(uuid=latest_result_uuid).execute() if src["portable_data_hash"] != dst["portable_data_hash"]: logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) api.collections().update(uuid=latest_result_uuid, body={"manifest_text": src["manifest_text"], - "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute() break +def move_fastq_to_fasta_results(api, analysis_project, uploader_project): + projects = api.groups().list(filters=[['owner_uuid', '=', analysis_project], + ["properties.moved_output", "!=", True]], + order="created_at desc",).execute() + for p in projects["items"]: + wf = get_workflow_output_from_project(api, p["uuid"]) + if wf: + logging.info("Moving completed fastq2fasta result %s back to uploader project", wf["output_uuid"]) + api.collections().update(uuid=wf["output_uuid"], + body={"owner_uuid": uploader_project}).execute() + p["properties"]["moved_output"] = True + api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute() + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') - parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--pangenome-analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--fastq-project', type=str, default='lugli-j7d0g-xcjxp4oox2u1w8u', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') - parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') - parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') + + parser.add_argument('--pangenome-workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--fastq-workflow-uuid', type=str, default='lugli-7fd4e-2zp9q4jo5xpif9y', help='') + + parser.add_argument('--latest-result-collection', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -116,16 +177,24 @@ def main(): logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) while True: + move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project) + new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() at_least_one_new_valid_seq = False for c in new_collections["items"]: - at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + at_least_one_new_valid_seq = validate_upload(api, c, + args.validated_project, + args.fastq_project, + args.fastq_workflow_uuid) or at_least_one_new_valid_seq if at_least_one_new_valid_seq: - start_analysis(api, args.analysis_project, - args.workflow_uuid, - args.validated_project) + start_pangenome_analysis(api, + args.pangenome_analysis_project, + args.pangenome_workflow_uuid, + args.validated_project) - copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + copy_most_recent_result(api, + args.pangenome_analysis_project, + args.latest_result_collection) - time.sleep(10) + time.sleep(15) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 8b8fefe..56cbe22 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -20,12 +20,18 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - qc_metadata(args.metadata.name) + if not qc_metadata(args.metadata.name): + print("Failed metadata qc") + exit(1) col = arvados.collection.Collection(api_client=api) - print("Reading FASTA") - with col.open("sequence.fasta", "w") as f: + if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"): + target = "sequence.fasta" + elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"): + target = "reads.fastq" + + with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) while r: @@ -52,5 +58,7 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) + print("Done") + if __name__ == "__main__": main() diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 78b31b2..ebe4dfc 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,6 +1,7 @@ import schema_salad.schema import logging import pkg_resources +import logging def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") @@ -17,5 +18,6 @@ def qc_metadata(metadatafile): try: doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) return True - except: - return False + except Exception as e: + logging.warn(e) + return False -- cgit v1.2.3 From d7498093d0f5e0db052ef88815d57c2648d09425 Mon Sep 17 00:00:00 2001 From: lltommy Date: Thu, 9 Apr 2020 14:34:54 +0200 Subject: Updating schema and examples. This is still work in progress but we get there --- bh20sequploader/bh20seq-schema.yml | 60 ++++++++++++++++++++++++++------------ example/metadata.yaml | 49 ++++++++++++++++++++++--------- example/minimal_example.yaml | 14 +++++++++ 3 files changed, 91 insertions(+), 32 deletions(-) create mode 100644 example/minimal_example.yaml diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 6e0973a..38cfb48 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,36 +1,60 @@ $graph: -- name: sampleInformationSchema +- name: hostSchema type: record fields: - location: string - host: string - sequenceTechnology: string - assemblyMethod: string + host_id: string + host_species: string + host_common_name: string? + host_sex: string? + host_age: int? + host_age_unit: string? + host_health_status: string? + host_treatment: string? + additional_host_information: string? -- name: InstituteInformationSchema +- name: sampleSchema type: record fields: - OriginatingLab: string - SubmittingLab: string + collector_name: string + collecting_institution: string + specimen_source: string? + collection_date: string? + collection_location: string? + sample_storage_conditions: string? + additional_collection_information: string? -- name: SubmitterInformationSchema +- name: virusSchema type: record fields: - Submitter: string - submissionDate: string + virus_species: string? + virus_strain: string? -- name: VirusDetailSchema +- name: technologySchema type: record fields: - VirusName: string - AccessionId: string + sample_sequencing_technology: string + sequence_assembly_method: string? + sequencing_coverage: string? + +- name: submitterSchema + type: record + fields: + submitter_name: string + submitter_address: string? + originating_lab: string + lab_address: string? + provider_sample_id: string? + submitter_sample_id: string? + authors: string? + submitter_id: string? - name: MainSchema type: record documentRoot: true fields: - sampleInformation: sampleInformationSchema - InstituteInformation: InstituteInformationSchema - SubmitterInformation: SubmitterInformationSchema - VirusDetail: VirusDetailSchema + host: hostSchema + sample: sampleSchema + virus: virusSchema? + technology: technologySchema + submitter: submitterSchema \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml index 587d0be..8a93379 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -1,17 +1,38 @@ -sampleInformation: - location: "USA" - host : "Homo Sapiens" - sequenceTechnology: "Sanger" - assemblyMethod: "CLC Genomics" +host: + host_id: XX1 + host_species: string + host_common_name: string + host_sex: string + host_age: 20 + host_age_unit: string + host_health_status: string + host_treatment: string + additional_host_information: string -InstituteInformation: - OriginatingLab: "Erik's kitchen" - SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" +sample: + collector_name: XXX + collecting_institution: XXX + specimen_source: XXX + collection_date: XXX + collection_location: XXX + sample_storage_conditions: XXX + additional_collection_information: XXX -SubmitterInformation: - Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" - submissionDate: "04-04-2020" +virus: + virus_species: XX + virus_strain: XX -VirusDetail: - VirusName: "hCoV-19/USA/identifer/2020" - AccessionId: "EPI_ISL_Random" +technology: + sample_sequencing_technology: XX + sequence_assembly_method: XX + sequencing_coverage: 70x + +submitter: + submitter_name: tester + submitter_address: testerAdd + originating_lab: testLab + lab_address: labAdd + provider_sample_id: string + submitter_sample_id: string + authors: testAuthor + submitter_id: X12 \ No newline at end of file diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml new file mode 100644 index 0000000..201b080 --- /dev/null +++ b/example/minimal_example.yaml @@ -0,0 +1,14 @@ +host: + host_id: XX + host_species: string + +sample: + collector_name: XXX + collecting_institution: XXX + +technology: + sample_sequencing_technology: XX + +submitter: + submitter_name: tester + originating_lab: testLab \ No newline at end of file -- cgit v1.2.3 From deedb2ed7046bbe81136b8d9d1edc353984d356b Mon Sep 17 00:00:00 2001 From: lltommy Date: Thu, 9 Apr 2020 20:38:55 +0200 Subject: Adding functionality of turning keys into ontology terms (URI). This is work in progress - of course! --- bh20sequploader/bh20seq-schema.yml | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 38cfb48..fd9e854 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -3,14 +3,20 @@ $graph: - name: hostSchema type: record fields: + host_species: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 host_id: string - host_species: string host_common_name: string? host_sex: string? host_age: int? host_age_unit: string? host_health_status: string? - host_treatment: string? + host_treatment: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 additional_host_information: string? - name: sampleSchema @@ -20,7 +26,10 @@ $graph: collecting_institution: string specimen_source: string? collection_date: string? - collection_location: string? + collection_location: + type: string? + jsonldPredicate: + _id: https://schema.org/fromLocation sample_storage_conditions: string? additional_collection_information: string? @@ -33,9 +42,18 @@ $graph: - name: technologySchema type: record fields: - sample_sequencing_technology: string - sequence_assembly_method: string? - sequencing_coverage: string? + sample_sequencing_technology: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + sequence_assembly_method: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0002699 + sequencing_coverage: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 - name: submitterSchema type: record -- cgit v1.2.3 From dbe094a150d6c969b3d69f112b3538e6a87a74a2 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 9 Apr 2020 15:59:46 -0400 Subject: Add "sequencefile" for the metadata subject. --- bh20sequploader/bh20seq-schema.yml | 13 ++++++++++++- example/metadata.yaml | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index fd9e854..5c962d1 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,3 +1,8 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ $graph: - name: hostSchema @@ -75,4 +80,10 @@ $graph: sample: sampleSchema virus: virusSchema? technology: technologySchema - submitter: submitterSchema \ No newline at end of file + submitter: submitterSchema + sequencefile: + doc: The subject (eg the fasta/fastq file) that this metadata describes + type: string? + jsonldPredicate: + _id: "@id" + _type: "@id" diff --git a/example/metadata.yaml b/example/metadata.yaml index 8a93379..41ff93e 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -35,4 +35,4 @@ submitter: provider_sample_id: string submitter_sample_id: string authors: testAuthor - submitter_id: X12 \ No newline at end of file + submitter_id: X12 -- cgit v1.2.3