From afbc3ec99f638a2f8df96a8e952b5b9616dc99a8 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 13:31:49 -0400 Subject: Now moves collections into 'validated sequences' project Improve logging for seq service Fix uploader bug Runs workflow with all validated sequences. --- bh20seqanalyzer/main.py | 83 +++++++++++++++++++++++++++++++++++++++---------- bh20sequploader/main.py | 3 +- setup.py | 2 +- 3 files changed, 69 insertions(+), 19 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 23e58e9..dae8eca 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -1,29 +1,70 @@ import argparse import arvados +import arvados.collection import time import subprocess import tempfile import json +import logging + +logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) +logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) + +def validate_upload(api, collection, validated_project): + col = arvados.collection.Collection(collection["uuid"]) + + # validate the collection here. Check metadata, etc. + valid = True + + if "sequence.fasta" not in col: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) + if "metadata.jsonld" not in col: + logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + valid = False + + dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], + ["portable_data_hash", "=", col.portable_data_hash()]]).execute() + if dup["items"]: + # This exact collection has been uploaded before. 
+ valid = False + logging.warn("Upload '%s' is duplicate" % collection["name"]) + + if valid: + logging.info("Added '%s' to validated sequences" % collection["name"]) + # Move it to the "validated" project to be included in the next analysis + api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + else: + # It is invalid, delete it. + logging.warn("Deleting '%s'" % collection["name"]) + api.collections().delete(uuid=collection["uuid"]).execute() + + return valid + +def start_analysis(api, + analysis_project, + workflow_uuid, + validated_project): -def start_analysis(api, collection, analysis_project, workflow_uuid): project = api.groups().create(body={ "group_class": "project", - "name": "Analysis of %s" % collection["name"], + "name": "Pangenome analysis", "owner_uuid": analysis_project, }, ensure_unique_name=True).execute() + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + with tempfile.NamedTemporaryFile() as tmp: - inputobj = json.dumps({ - "sequence": { + inputobj = { + "inputReads": [] + } + for v in validated: + inputobj["inputReads"].append({ "class": "File", - "location": "keep:%s/sequence.fasta" % collection["portable_data_hash"] - }, - "metadata": { - "class": "File", - "location": "keep:%s/metadata.jsonld" % collection["portable_data_hash"] - } - }, indent=2) - tmp.write(inputobj.encode('utf-8')) + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", "--submit", @@ -32,24 +73,32 @@ def start_analysis(api, collection, analysis_project, workflow_uuid): "--project-uuid=%s" % project["uuid"], "arvwf:%s" % workflow_uuid, tmp.name] - print("Running %s" % ' '.join(cmd)) + logging.info("Running %s" % ' '.join(cmd)) comp = subprocess.run(cmd, capture_output=True) if comp.returncode != 0: - print(comp.stderr.decode('utf-8')) - else: - 
api.collections().update(uuid=collection["uuid"], body={"owner_uuid": project['uuid']}).execute() + logging.error(comp.stderr.decode('utf-8')) + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') args = parser.parse_args() api = arvados.api() + logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) + while True: new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() + at_least_one_new_valid_seq = False for c in new_collections["items"]: - start_analysis(api, c, args.analysis_project, args.workflow_uuid) + at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + + if at_least_one_new_valid_seq: + start_analysis(api, args.analysis_project, + args.workflow_uuid, + args.validated_project) time.sleep(10) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 17ad492..d3ebc0c 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -49,4 +49,5 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) -main() +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 9e73ff0..0685d37 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import setuptools.command.egg_info as egg_info_cmd from setuptools import setup SETUP_DIR = os.path.dirname(__file__) -README = os.path.join(SETUP_DIR, "README.rst") +README = os.path.join(SETUP_DIR, "README.md") try: import gittaggers -- cgit v1.2.3 From 
40df65dec296b81650987c8ee4f832b703ab8f74 Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 7 Apr 2020 19:51:49 +0200 Subject: adding dummy metadata qc to the project --- bh20sequploader/qc_metadata.py | 13 +++++++++++++ example/dummyschema.yaml | 16 ++++++++++++++++ example/metadata.json | 0 example/metadata.yaml | 17 +++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 bh20sequploader/qc_metadata.py create mode 100644 example/dummyschema.yaml delete mode 100644 example/metadata.json create mode 100644 example/metadata.yaml diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py new file mode 100644 index 0000000..0632777 --- /dev/null +++ b/bh20sequploader/qc_metadata.py @@ -0,0 +1,13 @@ +import yamale + +## NOTE: this is just a DUMMY. Everything about this can and will change +def qc_metadata(metadatafile): + print("Start metadata validation...") + schema = yamale.make_schema('../example/dummyschema.yaml') + data = yamale.make_data(metadatafile) + # Validate data against the schema. Throws a ValueError if data is invalid. 
+ yamale.validate(schema, data) + print("...complete!") + +#qc_metadata("../example/metadata.yaml") + diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml new file mode 100644 index 0000000..e428324 --- /dev/null +++ b/example/dummyschema.yaml @@ -0,0 +1,16 @@ +#sampleInformation: include('sampleInformation') +#InstituteInformation: include('InstituteInformation') +--- +sampleInformation: + location : str() + host : str() + sequenceTechnology: str() + assemblyMethod: str() + +InstituteInformation: + OriginatingLab: str() + SubmittingLab: str() + +VirusDetail: + VirusName: str() + AccessionId: str() diff --git a/example/metadata.json b/example/metadata.json deleted file mode 100644 index e69de29..0000000 diff --git a/example/metadata.yaml b/example/metadata.yaml new file mode 100644 index 0000000..587d0be --- /dev/null +++ b/example/metadata.yaml @@ -0,0 +1,17 @@ +sampleInformation: + location: "USA" + host : "Homo Sapiens" + sequenceTechnology: "Sanger" + assemblyMethod: "CLC Genomics" + +InstituteInformation: + OriginatingLab: "Erik's kitchen" + SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" + +SubmitterInformation: + Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" + submissionDate: "04-04-2020" + +VirusDetail: + VirusName: "hCoV-19/USA/identifer/2020" + AccessionId: "EPI_ISL_Random" -- cgit v1.2.3 From 102a3663123292375440eeb04276d22a5b4645e0 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:27:10 -0400 Subject: Copy recent results to a set destination --- bh20seqanalyzer/main.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index dae8eca..2db97f6 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -11,7 +11,7 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) 
logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, latest_result_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. @@ -79,12 +79,31 @@ def start_analysis(api, logging.error(comp.stderr.decode('utf-8')) +def copy_most_recent_result(api, analysis_project, latest_result_uuid): + most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], + order="created_at desc").execute() + for m in most_recent_analysis["items"]: + cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + wf = cr["items"][0] + src = api.collections().get(uuid=wf["output_uuid"]).execute() + dst = api.collections().get(uuid=latest_result_uuid).execute() + if src["portable_data_hash"] != dst["portable_data_hash"]: + logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) + api.collections().update(uuid=latest_result_uuid, + body={"manifest_text": src["manifest_text"], + "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + break + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -101,4 +120,7 @@ def 
main(): start_analysis(api, args.analysis_project, args.workflow_uuid, args.validated_project) + + copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + time.sleep(10) -- cgit v1.2.3 From 4215a82af730ff05b8fe98e226b759413cdf95f7 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 14:37:19 -0400 Subject: limit 1 --- bh20seqanalyzer/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2db97f6..2513ea3 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -81,7 +81,7 @@ def start_analysis(api, def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], - order="created_at desc").execute() + order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], ["requesting_container_uuid", "=", None]]).execute() -- cgit v1.2.3 From 37652786cb6605a4862e820f2ba85f2fe818952f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 7 Apr 2020 11:58:33 -0700 Subject: Make README more didactic --- README.md | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index ec9afb1..a6fe052 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,162 @@ # Sequence uploader -This repository provides a sequence uploader for the +This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. -# Run +To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage). 
-Run the uploader with a FASTA file and accompanying metadata: +# Installation - python3 bh20sequploader/main.py example/sequence.fasta example/metadata.json +There are several ways to install the uploader. The most portable is with a [virtualenv](#installation-with-virtualenv). -# Add a workflow +## Installation with `virtualenv` -get your SARS-CoV-2 sequences from GenBank in seqs.fa +1. **Prepare your system.** You need to make sure you have Python, and the ability to install modules such as `pycurl` and `pyopenssl`. On Ubuntu 18.04, you can run: ```sh -minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf -seqwish -s seqs.fa -p seqs.paf -g seqs.gfa -odgi build -g seqs.gfa -s -o seqs.odgi -odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev ``` -from https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes +2. **Create and enter your virtualenv.** Go to some memorable directory and make and enter a virtualenv: -# Installation +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +3. **Install the tool.** Once in your virtualenv, install this project: + +```sh +pip3 install git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +4. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +**Make sure you are in your virtualenv whenever you run the tool!** If you ever can't run the tool, and your prompt doesn't say `(venv)`, try going to the directory where you put the virtualenv and running `. venv/bin/activate`. It only works for the current terminal window; you will need to run it again if you open a new terminal. 
+ +## Installation with `pip3 --user` + +If you don't want to have to enter a virtualenv every time you use the uploader, you can use the `--user` feature of `pip3` to install the tool for your user. + +1. **Prepare your system.** Just as for the `virtualenv` method, you need to install some dependencies. On Ubuntu 18.04, you can run: + +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Install the tool.** You can run: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +3. **Make sure the tool is on your `PATH`.** THe `pip3` command will install the uploader in `.local/bin` inside your home directory. Your shell may not know to look for commands there by default. To fix this for the terminal you currently have open, run: + +```sh +export PATH=$PATH:$HOME/.local/bin +``` + +To make this change permanent, assuming your shell is Bash, run: + +```sh +echo 'export PATH=$PATH:$HOME/.local/bin' >>~/.bashrc +``` + +4. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. -This tool requires the arvados Python module which can be installed -using .deb or .rpm packages through -https://doc.arvados.org/v2.0/sdk/python/sdk-python.html. The actual -code lives [here](https://github.com/arvados/arvados/tree/master/sdk/python) and -suggests a local install using +## Installation from Source for Development - apt-get install libcurl4-openssl-dev libssl1.0-dev - pip3 install --user arvados-python-client +If you plan to contribute to the project, you may want to install an editable copy from source. With this method, changes to the source code are automatically reflected in the installed copy of the tool. -Next update +1. 
**Prepare your system.** On Ubuntu 18.04, you can run: - export PATH=$PATH:$HOME/.local/bin +```sh +sudo apt update +sudo apt install -y virtualenv git libcurl4-openssl-dev build-essential python3-dev libssl-dev +``` + +2. **Clone and enter the repository.** You can run: + +```sh +git clone https://github.com/arvados/bh20-seq-resource.git +cd bh20-seq-resource +``` + +3. **Create and enter a virtualenv.** Go to some memorable directory and make and enter a virtualenv: + +```sh +virtualenv --python python3 venv +. venv/bin/activate +``` + +Note that you will need to repeat the `. venv/bin/activate` step from this directory to enter your virtualenv whenever you want to use the installed tool. + +4. **Install the checked-out repository in editable mode.** Once in your virtualenv, install with this special pip command: + +```sh +pip3 install -e . +``` + +5. **Test the tool.** Try running: + +```sh +bh20-seq-uploader --help +``` + +It should print some instructions about how to use the uploader. + +## Installation with GNU Guix -## Install with GNU Guix +Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. -Set up a container: +1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: + +```sh +~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs +``` + +2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). 
In brief: + +```sh +pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master +``` + +# Usage + +Run the uploader with a FASTA file and accompanying metadata file in [JSON-LD format](https://json-ld.org/): + +```sh +bh20-seq-uploader example/sequence.fasta example/metadata.json +``` + +## Workflow for Generating a Pangenome + +All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. + +Get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run: + +```sh +minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf +seqwish -s seqs.fa -p seqs.paf -g seqs.gfa +odgi build -g seqs.gfa -s -o seqs.odgi +odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 +``` - ~/opt/guix/bin/guix environment -C guix --ad-hoc python openssl python-pycurl nss-certs - pip3 install --user arvados-python-client +For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). 
-Pip installed the following modules - arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-client-1.6.7 httplib2-0.17.1 oauth2client-4.1.3 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.0 ruamel.yaml-0.15.77 six-1.14.0 uritemplate-3.0.1 ws4py-0.5.1 -- cgit v1.2.3 From 07bc4c65535437b8e9e0744f08da8cea541d0116 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Tue, 7 Apr 2020 15:28:42 -0400 Subject: Add metadata validation with schema-salad --- bh20seqanalyzer/main.py | 11 ++++++++--- bh20sequploader/bh20seq-schema.yml | 36 ++++++++++++++++++++++++++++++++++++ bh20sequploader/main.py | 7 +++++-- bh20sequploader/qc_metadata.py | 26 +++++++++++++++++--------- example/dummyschema.yaml | 16 ---------------- setup.py | 3 ++- 6 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 bh20sequploader/bh20seq-schema.yml delete mode 100644 example/dummyschema.yaml diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2513ea3..78e32c9 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -6,12 +6,14 @@ import subprocess import tempfile import json import logging +import ruamel.yaml +from bh20sequploader.qc_metadata import qc_metadata logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project, latest_result_uuid): +def validate_upload(api, collection, validated_project): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. 
@@ -20,9 +22,12 @@ def validate_upload(api, collection, validated_project, latest_result_uuid): if "sequence.fasta" not in col: valid = False logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) - if "metadata.jsonld" not in col: - logging.warn("Upload '%s' missing metadata.jsonld", collection["name"]) + if "metadata.yaml" not in col: + logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False + else: + metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) + valid = qc_metadata(metadata_content) and valid dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml new file mode 100644 index 0000000..6e0973a --- /dev/null +++ b/bh20sequploader/bh20seq-schema.yml @@ -0,0 +1,36 @@ +$graph: + +- name: sampleInformationSchema + type: record + fields: + location: string + host: string + sequenceTechnology: string + assemblyMethod: string + +- name: InstituteInformationSchema + type: record + fields: + OriginatingLab: string + SubmittingLab: string + +- name: SubmitterInformationSchema + type: record + fields: + Submitter: string + submissionDate: string + +- name: VirusDetailSchema + type: record + fields: + VirusName: string + AccessionId: string + +- name: MainSchema + type: record + documentRoot: true + fields: + sampleInformation: sampleInformationSchema + InstituteInformation: InstituteInformationSchema + SubmitterInformation: SubmitterInformationSchema + VirusDetail: VirusDetailSchema diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index d3ebc0c..8b8fefe 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,6 +6,7 @@ import json import urllib.request import socket import getpass +from .qc_metadata import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' 
ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -19,6 +20,8 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + qc_metadata(args.metadata.name) + col = arvados.collection.Collection(api_client=api) print("Reading FASTA") @@ -29,8 +32,8 @@ def main(): f.write(r) r = args.sequence.read(65536) - print("Reading JSONLD") - with col.open("metadata.jsonld", "w") as f: + print("Reading metadata") + with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 0632777..78b31b2 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,13 +1,21 @@ -import yamale +import schema_salad.schema +import logging +import pkg_resources -## NOTE: this is just a DUMMY. Everything about this can and will change def qc_metadata(metadatafile): - print("Start metadata validation...") - schema = yamale.make_schema('../example/dummyschema.yaml') - data = yamale.make_data(metadatafile) - # Validate data against the schema. Throws a ValueError if data is invalid. 
- yamale.validate(schema, data) - print("...complete!") + schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") + cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} + (document_loader, + avsc_names, + schema_metadata, + metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) -#qc_metadata("../example/metadata.yaml") + if not isinstance(avsc_names, schema_salad.avro.schema.Names): + print(avsc_names) + return False + try: + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) + return True + except: + return False diff --git a/example/dummyschema.yaml b/example/dummyschema.yaml deleted file mode 100644 index e428324..0000000 --- a/example/dummyschema.yaml +++ /dev/null @@ -1,16 +0,0 @@ -#sampleInformation: include('sampleInformation') -#InstituteInformation: include('InstituteInformation') ---- -sampleInformation: - location : str() - host : str() - sequenceTechnology: str() - assemblyMethod: str() - -InstituteInformation: - OriginatingLab: str() - SubmittingLab: str() - -VirusDetail: - VirusName: str() - AccessionId: str() diff --git a/setup.py b/setup.py index 0685d37..48c25aa 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client"] +install_requires = ["arvados-python-client", "schema-salad"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] @@ -30,6 +30,7 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer"], + package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, install_requires=install_requires, setup_requires=[] + 
pytest_runner, tests_require=["pytest<5"], -- cgit v1.2.3 From 14ff178ed7f77a996f47e2115e2a1429f6b69356 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 12:12:49 -0700 Subject: Spell correctly --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6fe052..1448f4c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This repository provides a sequence uploader for the COVID-19 Virtual Biohackathon's Public Sequence Resource project. You can use it to upload the genomes of SARS-CoV-2 samples to make them publicly and freely available to other researchers. -To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [uplaod your data](#usage). +To get started, first [install the uploader](#installation), and use the `bh20-seq-uploader` command to [upload your data](#usage). # Installation -- cgit v1.2.3 From 414c308b8860d1b20481a2ec3b2f6381e4f6061b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 14:11:39 -0700 Subject: Initial commit of working frontend --- __pycache__/main.cpython-36.pyc | Bin 0 -> 2716 bytes main.py | 98 ++++++++++++++++++++++++++++++++++++++++ pages/index.html | 28 ++++++++++++ templates/error.html | 19 ++++++++ templates/success.html | 24 ++++++++++ 5 files changed, 169 insertions(+) create mode 100644 __pycache__/main.cpython-36.pyc create mode 100644 main.py create mode 100644 pages/index.html create mode 100644 templates/error.html create mode 100644 templates/success.html diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc new file mode 100644 index 0000000..250c562 Binary files /dev/null and b/__pycache__/main.cpython-36.pyc differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..630669c --- /dev/null +++ b/main.py @@ -0,0 +1,98 @@ +import tempfile +import shutil +import subprocess +import os +from flask import Flask, request, redirect, send_file, send_from_directory, 
render_template + +app = Flask(__name__, static_url_path='/static', static_folder='static') + +# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. +# We will enforce the limit ourselves and set a higher safety limit here. +app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 + +# When a file is too big we get a 413. +@app.errorhandler(413) +def handle_large_file(e): + return (render_template('error.html', + error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + +@app.route('/') +def send_form(): + """ + Send the file upload form/front page. + """ + return send_from_directory('pages', 'index.html') + +class FileTooBigError(RuntimeError): + """ + Raised when the user gives a file that is too large. + """ + pass + +def copy_with_limit(in_file, out_file, limit=1024*1024): + """ + Copy a file stream, and raise FileTooBigError if the file is too big. + """ + + bytes_used = 0 + buf_size = 65536 + + buf = in_file.read(buf_size) + bytes_used += len(buf) + while buf: + if bytes_used > limit: + raise FileTooBigError('Hit file length limit') + out_file.write(buf) + buf = in_file.read(buf_size) + bytes_used += len(buf) + + +@app.route('/submit', methods=['POST']) +def recieve_files(): + """ + Recieve the uploaded files. 
+ """ + + # We're going to work in one directory per request + dest_dir = tempfile.mkdtemp() + try: + + print(request) + print(request.files) + + if 'fasta' not in request.files: + return (render_template('error.html', + error_message="You did not include a FASTA file."), 403) + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') + + try: + with open(fasta_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('fasta').stream, out_stream) + with open(metadata_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + + # Try and upload files to Arvados + result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode != 0: + # It didn't work. Complain. + error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) + return (render_template('error.html', error_message=error_message), 403) + else: + # It worked. Say so. 
+ return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) + finally: + shutil.rmtree(dest_dir) + + + + diff --git a/pages/index.html b/pages/index.html new file mode 100644 index 0000000..2269791 --- /dev/null +++ b/pages/index.html @@ -0,0 +1,28 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</title> + </head> + <body> + <h1>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</h1> + <hr> + <p> + This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. Your uploaded sequence will automatically be processed and incorporated into the public pangenome. + </p> + <hr> + <form action="/submit" method="POST" enctype="multipart/form-data"> + <label for="fasta">Select FASTA file for assembled genome (max 1MB):</label> + <br> + <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna"> + <br> + <label for="metadata">Select JSON-LD metadata file following <a href="https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml">this schema</a> (max 1MB):</label> + <br> + <input type="file" id="metadata" name="metadata" accept=".json"> + <br> + <input type="submit" value="Add to Pangenome"> + </form> + <hr> + </body> +</html> diff --git a/templates/error.html b/templates/error.html new file mode 100644 index 0000000..c2ab0a4 --- /dev/null +++ b/templates/error.html @@ -0,0 +1,19 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Upload Failed</title> + </head> + <body> + <h1>Upload Failed</h1> + <hr> + <p> + Your upload has failed. 
{{error_message}} + </p> + <p> + <a href="/">Click here to try again.</a> + </p> + <hr> + </body> +</html> diff --git a/templates/success.html b/templates/success.html new file mode 100644 index 0000000..1be7861 --- /dev/null +++ b/templates/success.html @@ -0,0 +1,24 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Upload Successful</title> + </head> + <body> + <h1>Upload Successful</h1> + <hr> + <p> + Your files have been uploaded. They should soon appear as part of the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. + </p> + <p> + The upload log was: + </p> + <pre>{{log}}</pre> + <hr> + <p> + <a href="/">Click here to upload more files.</a> + </p> + <hr> + </body> +</html> -- cgit v1.2.3 From 9458ed33da08c787c4bb20af7b4108c93334b351 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Wed, 8 Apr 2020 17:41:19 -0400 Subject: Fastq now runs through fastq2fasta pipeline then gets added to pangenome analysis. --- bh20seqanalyzer/main.py | 141 ++++++++++++++++++++++++++++++----------- bh20sequploader/main.py | 14 +++- bh20sequploader/qc_metadata.py | 6 +- 3 files changed, 120 insertions(+), 41 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 78e32c9..1a8965b 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -13,21 +13,30 @@ logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="% level=logging.INFO) logging.getLogger("googleapiclient.discovery").setLevel(logging.WARN) -def validate_upload(api, collection, validated_project): +def validate_upload(api, collection, validated_project, + fastq_project, fastq_workflow_uuid): col = arvados.collection.Collection(collection["uuid"]) # validate the collection here. Check metadata, etc. 
valid = True - if "sequence.fasta" not in col: - valid = False - logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) if "metadata.yaml" not in col: logging.warn("Upload '%s' missing metadata.yaml", collection["name"]) valid = False else: metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) - valid = qc_metadata(metadata_content) and valid + #valid = qc_metadata(metadata_content) and valid + if not valid: + logging.warn("Failed metadata qc") + + if valid: + if "sequence.fasta" not in col: + if "reads.fastq" in col: + start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) + return False + else: + valid = False + logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() @@ -39,7 +48,9 @@ def validate_upload(api, collection, validated_project): if valid: logging.info("Added '%s' to validated sequences" % collection["name"]) # Move it to the "validated" project to be included in the next analysis - api.collections().update(uuid=collection["uuid"], body={"owner_uuid": validated_project}).execute() + api.collections().update(uuid=collection["uuid"], body={ + "owner_uuid": validated_project, + "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute() else: # It is invalid, delete it. 
logging.warn("Deleting '%s'" % collection["name"]) @@ -47,28 +58,15 @@ def validate_upload(api, collection, validated_project): return valid -def start_analysis(api, - analysis_project, - workflow_uuid, - validated_project): +def run_workflow(api, parent_project, workflow_uuid, name, inputobj): project = api.groups().create(body={ "group_class": "project", - "name": "Pangenome analysis", - "owner_uuid": analysis_project, + "name": name, + "owner_uuid": parent_project, }, ensure_unique_name=True).execute() - validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) - with tempfile.NamedTemporaryFile() as tmp: - inputobj = { - "inputReads": [] - } - for v in validated: - inputobj["inputReads"].append({ - "class": "File", - "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] - }) tmp.write(json.dumps(inputobj, indent=2).encode('utf-8')) tmp.flush() cmd = ["arvados-cwl-runner", @@ -83,32 +81,95 @@ def start_analysis(api, if comp.returncode != 0: logging.error(comp.stderr.decode('utf-8')) + return project + + +def start_fastq_to_fasta(api, collection, + analysis_project, + fastq_workflow_uuid): + newproject = run_workflow(api, analysis_project, fastq_workflow_uuid, "FASTQ to FASTA", { + "fastq_forward": { + "class": "File", + "location": "keep:%s/reads.fastq" % collection["portable_data_hash"] + }, + "metadata": { + "class": "File", + "location": "keep:%s/metadata.yaml" % collection["portable_data_hash"] + }, + "ref_fasta": { + "class": "File", + "location": "keep:ffef6a3b77e5e04f8f62a7b6f67264d1+556/SARS-CoV2-NC_045512.2.fasta" + } + }) + api.collections().update(uuid=collection["uuid"], + body={"owner_uuid": newproject["uuid"]}).execute() + +def start_pangenome_analysis(api, + analysis_project, + pangenome_workflow_uuid, + validated_project): + validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) + inputobj = { + "inputReads": [] + } + for v in 
validated: + inputobj["inputReads"].append({ + "class": "File", + "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] + }) + run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj) + + +def get_workflow_output_from_project(api, uuid): + cr = api.container_requests().list(filters=[['owner_uuid', '=', uuid], + ["requesting_container_uuid", "=", None]]).execute() + if cr["items"] and cr["items"][0]["output_uuid"]: + return cr["items"][0] + else: + return None + def copy_most_recent_result(api, analysis_project, latest_result_uuid): most_recent_analysis = api.groups().list(filters=[['owner_uuid', '=', analysis_project]], order="created_at desc", limit=1).execute() for m in most_recent_analysis["items"]: - cr = api.container_requests().list(filters=[['owner_uuid', '=', m["uuid"]], - ["requesting_container_uuid", "=", None]]).execute() - if cr["items"] and cr["items"][0]["output_uuid"]: - wf = cr["items"][0] + wf = get_workflow_output_from_project(api, m["uuid"]) + if wf: src = api.collections().get(uuid=wf["output_uuid"]).execute() dst = api.collections().get(uuid=latest_result_uuid).execute() if src["portable_data_hash"] != dst["portable_data_hash"]: logging.info("Copying latest result from '%s' to %s", m["name"], latest_result_uuid) api.collections().update(uuid=latest_result_uuid, body={"manifest_text": src["manifest_text"], - "description": "latest result from %s %s" % (m["name"], wf["uuid"])}).execute() + "description": "Result from %s %s" % (m["name"], wf["uuid"])}).execute() break +def move_fastq_to_fasta_results(api, analysis_project, uploader_project): + projects = api.groups().list(filters=[['owner_uuid', '=', analysis_project], + ["properties.moved_output", "!=", True]], + order="created_at desc",).execute() + for p in projects["items"]: + wf = get_workflow_output_from_project(api, p["uuid"]) + if wf: + logging.info("Moving completed fastq2fasta result %s back to uploader project", wf["output_uuid"]) + 
api.collections().update(uuid=wf["output_uuid"], + body={"owner_uuid": uploader_project}).execute() + p["properties"]["moved_output"] = True + api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute() + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') - parser.add_argument('--analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--pangenome-analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') + parser.add_argument('--fastq-project', type=str, default='lugli-j7d0g-xcjxp4oox2u1w8u', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') - parser.add_argument('--workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') - parser.add_argument('--latest-result-uuid', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') + + parser.add_argument('--pangenome-workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') + parser.add_argument('--fastq-workflow-uuid', type=str, default='lugli-7fd4e-2zp9q4jo5xpif9y', help='') + + parser.add_argument('--latest-result-collection', type=str, default='lugli-4zz18-z513nlpqm03hpca', help='') args = parser.parse_args() api = arvados.api() @@ -116,16 +177,24 @@ def main(): logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) while True: + move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project) + new_collections = api.collections().list(filters=[['owner_uuid', '=', args.uploader_project]]).execute() at_least_one_new_valid_seq = False for c in new_collections["items"]: - at_least_one_new_valid_seq = validate_upload(api, c, args.validated_project) or at_least_one_new_valid_seq + at_least_one_new_valid_seq = validate_upload(api, c, + args.validated_project, + 
args.fastq_project, + args.fastq_workflow_uuid) or at_least_one_new_valid_seq if at_least_one_new_valid_seq: - start_analysis(api, args.analysis_project, - args.workflow_uuid, - args.validated_project) + start_pangenome_analysis(api, + args.pangenome_analysis_project, + args.pangenome_workflow_uuid, + args.validated_project) - copy_most_recent_result(api, args.analysis_project, args.latest_result_uuid) + copy_most_recent_result(api, + args.pangenome_analysis_project, + args.latest_result_collection) - time.sleep(10) + time.sleep(15) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 8b8fefe..56cbe22 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -20,12 +20,18 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - qc_metadata(args.metadata.name) + if not qc_metadata(args.metadata.name): + print("Failed metadata qc") + exit(1) col = arvados.collection.Collection(api_client=api) - print("Reading FASTA") - with col.open("sequence.fasta", "w") as f: + if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"): + target = "sequence.fasta" + elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"): + target = "reads.fastq" + + with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) while r: @@ -52,5 +58,7 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) + print("Done") + if __name__ == "__main__": main() diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 78b31b2..ebe4dfc 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,6 +1,7 @@ import schema_salad.schema import logging import pkg_resources +import logging def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") @@ -17,5 +18,6 @@ def qc_metadata(metadatafile): try: doc, metadata = 
schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) return True - except: - return False + except Exception as e: + logging.warn(e) + return False -- cgit v1.2.3 From ce80c29ef5c93aed80ab3b98a3c2eedb740e32b6 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 15:07:39 -0700 Subject: Don't assert that the metadata is really JSON-LD --- pages/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pages/index.html b/pages/index.html index 2269791..c2e5b64 100644 --- a/pages/index.html +++ b/pages/index.html @@ -17,7 +17,7 @@ <br> <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna"> <br> - <label for="metadata">Select JSON-LD metadata file following <a href="https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml">this schema</a> (max 1MB):</label> + <label for="metadata">Select JSON metadata file following <a href="https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml">this schema</a> (max 1MB):</label> <br> <input type="file" id="metadata" name="metadata" accept=".json"> <br> -- cgit v1.2.3 From 60420f991a5bd3502bc6b89747d408da0d922839 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 8 Apr 2020 15:11:51 -0700 Subject: Add context links --- pages/index.html | 1 + 1 file changed, 1 insertion(+) diff --git a/pages/index.html b/pages/index.html index c2e5b64..543ab7d 100644 --- a/pages/index.html +++ b/pages/index.html @@ -24,5 +24,6 @@ <input type="submit" value="Add to Pangenome"> </form> <hr> + <small><a href="https://github.com/adamnovak/bh20-simple-web-uploader">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> </body> </html> -- cgit v1.2.3 From d7498093d0f5e0db052ef88815d57c2648d09425 Mon Sep 17 00:00:00 2001 From: lltommy Date: Thu, 9 Apr 2020 14:34:54 +0200 Subject: Updating schema and examples. 
This is still work in progress but we get there --- bh20sequploader/bh20seq-schema.yml | 60 ++++++++++++++++++++++++++------------ example/metadata.yaml | 49 ++++++++++++++++++++++--------- example/minimal_example.yaml | 14 +++++++++ 3 files changed, 91 insertions(+), 32 deletions(-) create mode 100644 example/minimal_example.yaml diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 6e0973a..38cfb48 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,36 +1,60 @@ $graph: -- name: sampleInformationSchema +- name: hostSchema type: record fields: - location: string - host: string - sequenceTechnology: string - assemblyMethod: string + host_id: string + host_species: string + host_common_name: string? + host_sex: string? + host_age: int? + host_age_unit: string? + host_health_status: string? + host_treatment: string? + additional_host_information: string? -- name: InstituteInformationSchema +- name: sampleSchema type: record fields: - OriginatingLab: string - SubmittingLab: string + collector_name: string + collecting_institution: string + specimen_source: string? + collection_date: string? + collection_location: string? + sample_storage_conditions: string? + additional_collection_information: string? -- name: SubmitterInformationSchema +- name: virusSchema type: record fields: - Submitter: string - submissionDate: string + virus_species: string? + virus_strain: string? -- name: VirusDetailSchema +- name: technologySchema type: record fields: - VirusName: string - AccessionId: string + sample_sequencing_technology: string + sequence_assembly_method: string? + sequencing_coverage: string? + +- name: submitterSchema + type: record + fields: + submitter_name: string + submitter_address: string? + originating_lab: string + lab_address: string? + provider_sample_id: string? + submitter_sample_id: string? + authors: string? + submitter_id: string? 
- name: MainSchema type: record documentRoot: true fields: - sampleInformation: sampleInformationSchema - InstituteInformation: InstituteInformationSchema - SubmitterInformation: SubmitterInformationSchema - VirusDetail: VirusDetailSchema + host: hostSchema + sample: sampleSchema + virus: virusSchema? + technology: technologySchema + submitter: submitterSchema \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml index 587d0be..8a93379 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -1,17 +1,38 @@ -sampleInformation: - location: "USA" - host : "Homo Sapiens" - sequenceTechnology: "Sanger" - assemblyMethod: "CLC Genomics" +host: + host_id: XX1 + host_species: string + host_common_name: string + host_sex: string + host_age: 20 + host_age_unit: string + host_health_status: string + host_treatment: string + additional_host_information: string -InstituteInformation: - OriginatingLab: "Erik's kitchen" - SubmittingLab: "National Institute for Viral Disease Control and Prevention, China CDC" +sample: + collector_name: XXX + collecting_institution: XXX + specimen_source: XXX + collection_date: XXX + collection_location: XXX + sample_storage_conditions: XXX + additional_collection_information: XXX -SubmitterInformation: - Submitter: "National Institute for Viral Disease Control and Prevention, China CDC" - submissionDate: "04-04-2020" +virus: + virus_species: XX + virus_strain: XX -VirusDetail: - VirusName: "hCoV-19/USA/identifer/2020" - AccessionId: "EPI_ISL_Random" +technology: + sample_sequencing_technology: XX + sequence_assembly_method: XX + sequencing_coverage: 70x + +submitter: + submitter_name: tester + submitter_address: testerAdd + originating_lab: testLab + lab_address: labAdd + provider_sample_id: string + submitter_sample_id: string + authors: testAuthor + submitter_id: X12 \ No newline at end of file diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml new file mode 100644 index 
0000000..201b080 --- /dev/null +++ b/example/minimal_example.yaml @@ -0,0 +1,14 @@ +host: + host_id: XX + host_species: string + +sample: + collector_name: XXX + collecting_institution: XXX + +technology: + sample_sequencing_technology: XX + +submitter: + submitter_name: tester + originating_lab: testLab \ No newline at end of file -- cgit v1.2.3 From deedb2ed7046bbe81136b8d9d1edc353984d356b Mon Sep 17 00:00:00 2001 From: lltommy Date: Thu, 9 Apr 2020 20:38:55 +0200 Subject: Adding functionality of turning keys into ontology terms (URI). This is work in progress - of course! --- bh20sequploader/bh20seq-schema.yml | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 38cfb48..fd9e854 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -3,14 +3,20 @@ $graph: - name: hostSchema type: record fields: + host_species: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 host_id: string - host_species: string host_common_name: string? host_sex: string? host_age: int? host_age_unit: string? host_health_status: string? - host_treatment: string? + host_treatment: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 additional_host_information: string? - name: sampleSchema @@ -20,7 +26,10 @@ $graph: collecting_institution: string specimen_source: string? collection_date: string? - collection_location: string? + collection_location: + type: string? + jsonldPredicate: + _id: https://schema.org/fromLocation sample_storage_conditions: string? additional_collection_information: string? @@ -33,9 +42,18 @@ $graph: - name: technologySchema type: record fields: - sample_sequencing_technology: string - sequence_assembly_method: string? - sequencing_coverage: string? 
+ sample_sequencing_technology: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + sequence_assembly_method: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0002699 + sequencing_coverage: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 - name: submitterSchema type: record -- cgit v1.2.3 From 03e857c1a477b04db11cf610760b1f2db7b859c5 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 12:43:42 -0700 Subject: Add auto-generated fillable metadata form --- __pycache__/main.cpython-36.pyc | Bin 2716 -> 6764 bytes main.py | 191 +++++++++++++++++++++++++++++++++++++--- pages/index.html | 29 ------ templates/form.html | 95 ++++++++++++++++++++ 4 files changed, 272 insertions(+), 43 deletions(-) delete mode 100644 pages/index.html create mode 100644 templates/form.html diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc index 250c562..0f929ad 100644 Binary files a/__pycache__/main.cpython-36.pyc and b/__pycache__/main.cpython-36.pyc differ diff --git a/main.py b/main.py index 630669c..d0f2793 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,12 @@ +import collections import tempfile import shutil import subprocess import os +import re +import string +import yaml +import urllib.request from flask import Flask, request, redirect, send_file, send_from_directory, render_template app = Flask(__name__, static_url_path='/static', static_folder='static') @@ -16,12 +21,118 @@ def handle_large_file(e): return (render_template('error.html', error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + +def type_to_heading(type_name): + """ + Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. 
+ """ + + # Remove camel case + decamel = re.sub('([A-Z])', r' \1', type_name) + # Split + parts = decamel.split() + # Capitalize words and remove unwanted components + filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] + # Reassemble + return ' '.join(filtered) + +def name_to_label(field_name): + """ + Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. + """ + + return string.capwords(field_name.replace('_', ' ')) + +def generate_form(schema): + """ + Linearize the schema and send a bunch of dicts. + Each dict either has a 'heading' (in which case we put a heading for a + form section in the template) or an 'id', 'label', 'type', and 'required' + (in which case we make a form field in the template). + """ + + # Get the list of form components, one of which is the root + components = schema.get('$graph', []) + + # Find the root + root_name = None + # And also index components by type name + by_name = {} + for component in components: + # Get the name of each + component_name = component.get('name', None) + if isinstance(component_name, str): + # And remember how to map back form it + by_name[component_name] = component + if component.get('documentRoot', False): + # Find whichever one is the root + root_name = component_name + + + def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): + """ + Do a traversal of the component tree. + Yield a bunch of form item dicts, in order. + Form IDs are .-separated keypaths for where they are in the structure. + parent_keys is the path of field names to where we are in the root record's document tree. 
+ """ + + if len(parent_keys) > 1: + # First make a heading, if we aren't the very root of the form + yield {'heading': type_to_heading(type_name)} + + for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): + # For each field + + ref_url = None + if not isinstance(field_type, str): + # If the type isn't a string + # See if it has a more info/what goes here URL + ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) + # Grab out its type field + field_type = field_type.get('type', '') + + # Decide if the field is optional (type ends in ?) + optional = False + if len(field_type) > 0 and field_type[-1] == '?': + # It's optional + optional = True + # Drop the ? + field_type = field_type[:-1] + + if field_type in by_name: + # This is a subrecord. We need to recurse + for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): + yield item + else: + # We know how to make a string input + record = {} + record['id'] = '.'.join(parent_keys + [field_name]) + record['label'] = name_to_label(field_name) + record['required'] = not optional and not subtree_optional + if ref_url: + record['ref_url'] = ref_url + if field_type == 'string': + record['type'] = 'text' # HTML input type + elif field_type == 'int': + record['type'] = 'number' + else: + raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) + yield record + + return list(walk_fields(root_name)) + +# At startup, we need to load the current metadata schema so we can make a form for it +METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) +FORM_ITEMS = generate_form(METADATA_SCHEMA) + @app.route('/') def send_form(): """ Send the file upload form/front page. 
""" - return send_from_directory('pages', 'index.html') + + return render_template('form.html', fields=FORM_ITEMS) class FileTooBigError(RuntimeError): """ @@ -46,6 +157,20 @@ def copy_with_limit(in_file, out_file, limit=1024*1024): buf = in_file.read(buf_size) bytes_used += len(buf) +def parse_input(input_string, html_type): + """ + Parse an input from the given HTML input type into a useful Python type. + + Raise ValueError if something does not parse. + Raise NotImplementedError if we forgot to implement a type. + """ + + if html_type == 'text': + return input_string + elif html_type == 'number': + return int(input_string) + else: + raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) @app.route('/submit', methods=['POST']) def recieve_files(): @@ -55,30 +180,68 @@ def recieve_files(): # We're going to work in one directory per request dest_dir = tempfile.mkdtemp() + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') try: - - print(request) - print(request.files) - if 'fasta' not in request.files: return (render_template('error.html', error_message="You did not include a FASTA file."), 403) - if 'metadata' not in request.files: - return (render_template('error.html', - error_message="You did not include a metadata file."), 403) - - fasta_dest = os.path.join(dest_dir, 'fasta.fa') - metadata_dest = os.path.join(dest_dir, 'metadata.json') - try: with open(fasta_dest, 'wb') as out_stream: copy_with_limit(request.files.get('fasta').stream, out_stream) - with open(metadata_dest, 'wb') as out_stream: - copy_with_limit(request.files.get('metadata').stream, out_stream) except FileTooBigError as e: # Delegate to the 413 error handler return handle_large_file(e) + if request.form.get('metadata_type', None) == 'upload': + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + try: + with open(metadata_dest, 'wb') 
as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + elif request.form.get('metadata_type', None) == 'fill': + # Build a metadata dict + metadata = {} + + for item in FORM_ITEMS: + # Pull all the field values we wanted from the form + if 'heading' in item: + continue + + if item['id'] in request.form and len(request.form[item['id']]) > 0: + # We have this thing. Make a place in the dict tree for it. + parts = item['id'].split('.') + key = parts[-1] + # Remove leading 'metadata' + path = parts[1:-1] + dest_dict = metadata + for parent in path: + if parent not in dest_dict: + dest_dict[parent] = {} + dest_dict = dest_dict[parent] + + try: + # Now finally add the item + dest_dict[key] = parse_input(request.form[item['id']], item['type']) + except ValueError: + # We don't like that input + return (render_template('error.html', + error_message="You provided an unacceptable value for the metadata item {}".format(item['id'])), 403) + elif item['required']: + return (render_template('error.html', + error_message="You omitted the required metadata item {}".format(item['id'])), 403) + + # Now serialize the file with all the items + with open(metadata_dest, 'w') as out_stream: + yaml.dump(metadata, out_stream) + else: + return (render_template('error.html', + error_message="You did not include metadata."), 403) + # Try and upload files to Arvados result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/pages/index.html b/pages/index.html deleted file mode 100644 index 543ab7d..0000000 --- a/pages/index.html +++ /dev/null @@ -1,29 +0,0 @@ -<!DOCTYPE html> -<html> - <head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</title> - </head> - <body> - 
<h1>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</h1> - <hr> - <p> - This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. Your uploaded sequence will automatically be processed and incorporated into the public pangenome. - </p> - <hr> - <form action="/submit" method="POST" enctype="multipart/form-data"> - <label for="fasta">Select FASTA file for assembled genome (max 1MB):</label> - <br> - <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna"> - <br> - <label for="metadata">Select JSON metadata file following <a href="https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml">this schema</a> (max 1MB):</label> - <br> - <input type="file" id="metadata" name="metadata" accept=".json"> - <br> - <input type="submit" value="Add to Pangenome"> - </form> - <hr> - <small><a href="https://github.com/adamnovak/bh20-simple-web-uploader">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> - </body> -</html> diff --git a/templates/form.html b/templates/form.html new file mode 100644 index 0000000..ec54de5 --- /dev/null +++ b/templates/form.html @@ -0,0 +1,95 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</title> + </head> + <body> + <h1>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</h1> + <hr> + <p> + This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. Your uploaded sequence will automatically be processed and incorporated into the public pangenome. 
+ </p> + <hr> + <form action="/submit" method="POST" enctype="multipart/form-data" id="main_form"> + <label for="fasta">Select FASTA file for assembled genome (max 1MB):</label> + <br> + <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna" required> + <br> + + <label>Select metadata submission method:</label> + <br> + <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="setMode()" checked required> + <label for="metadata_upload">Upload metadata file</label> + <br> + <input type="radio" id="metadata_form" name="metadata_type" value="fill" onchange="setMode()" required> + <label for="metadata_form">Fill in metadata manually</label> + <br> + + <div id="metadata_upload_form_spot"> + <div id="metadata_upload_form"> + <label for="metadata">Select JSON or YAML metadata file following <a href="https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml">this schema</a> (max 1MB):</label> + <br> + <input type="file" id="metadata" name="metadata" accept=".json,.yml,.yaml" required> + <br> + </div> + </div> + + <div id="metadata_fill_form_spot"> + <div id="metadata_fill_form"> + {% for record in fields %} + {% if 'heading' in record %} + <h4>{{ record['heading'] }}</h4> + {% else %} + <label for="{{ record['id'] }}"> + {{ record['label'] }} + {{ "*" if record['required'] else "" }} + {% if 'ref_url' in record %} + <a href="{{ record['ref_url'] }}" title="More Info" target="_blank">?</a> + {% endif %} + </label> + <br> + <input type="{{ record['type'] }}" id="{{ record['id'] }}" name="{{ record['id'] }}" {{ "required" if record['required'] else "" }}> + <br> + {% endif %} + {% endfor %} + </div> + </div> + + <input type="submit" value="Add to Pangenome"> + </form> + <hr> + <small><a href="https://github.com/adamnovak/bh20-simple-web-uploader">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> + <script type="text/javascript"> 
+ let uploadForm = document.getElementById('metadata_upload_form') + let uploadFormSpot = document.getElementById('metadata_upload_form_spot') + let fillForm = document.getElementById('metadata_fill_form') + let fillFormSpot = document.getElementById('metadata_fill_form_spot') + + function setUploadMode() { + // Make the upload form the one in use + uploadFormSpot.appendChild(uploadForm) + fillFormSpot.removeChild(fillForm) + } + + function setFillMode() { + // Make the fillable form the one in use + uploadFormSpot.removeChild(uploadForm) + fillFormSpot.appendChild(fillForm) + } + + function setMode() { + // Pick mode based on radio + if (document.getElementById('metadata_upload').checked) { + setUploadMode() + } else { + setFillMode() + } + } + + // Start in mode appropriate to selected form item + setMode() + </script> + </body> +</html> -- cgit v1.2.3 From 51b5686f1df140628f1b39ecf40b45fbc0d0a59a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 12:45:10 -0700 Subject: Don't include pyc --- __pycache__/main.cpython-36.pyc | Bin 6764 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 __pycache__/main.cpython-36.pyc diff --git a/__pycache__/main.cpython-36.pyc b/__pycache__/main.cpython-36.pyc deleted file mode 100644 index 0f929ad..0000000 Binary files a/__pycache__/main.cpython-36.pyc and /dev/null differ -- cgit v1.2.3 From 062230b12bb71c4b906318f1de3d67c0fd26f3ba Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 12:57:49 -0700 Subject: Make schema link nicer and add example files --- templates/form.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/form.html b/templates/form.html index ec54de5..4ad41e2 100644 --- a/templates/form.html +++ b/templates/form.html @@ -29,7 +29,7 @@ <div id="metadata_upload_form_spot"> <div id="metadata_upload_form"> - <label for="metadata">Select JSON or YAML metadata file following <a 
href="https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml">this schema</a> (max 1MB):</label> + <label for="metadata">Select JSON or YAML metadata file following <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml" target="_blank">this schema</a> (<a href="https://github.com/arvados/bh20-seq-resource/blob/master/example/metadata.yaml" target="_blank">Example 1</a>, <a href="https://github.com/arvados/bh20-seq-resource/blob/master/example/minimal_example.yaml" target="_blank">Example 2</a>, max 1MB):</label> <br> <input type="file" id="metadata" name="metadata" accept=".json,.yml,.yaml" required> <br> -- cgit v1.2.3 From dbe094a150d6c969b3d69f112b3538e6a87a74a2 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 9 Apr 2020 15:59:46 -0400 Subject: Add "sequencefile" for the metadata subject. --- bh20sequploader/bh20seq-schema.yml | 13 ++++++++++++- example/metadata.yaml | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index fd9e854..5c962d1 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,3 +1,8 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ $graph: - name: hostSchema @@ -75,4 +80,10 @@ $graph: sample: sampleSchema virus: virusSchema? technology: technologySchema - submitter: submitterSchema \ No newline at end of file + submitter: submitterSchema + sequencefile: + doc: The subject (eg the fasta/fastq file) that this metadata describes + type: string? 
+ jsonldPredicate: + _id: "@id" + _type: "@id" diff --git a/example/metadata.yaml b/example/metadata.yaml index 8a93379..41ff93e 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -35,4 +35,4 @@ submitter: provider_sample_id: string submitter_sample_id: string authors: testAuthor - submitter_id: X12 \ No newline at end of file + submitter_id: X12 -- cgit v1.2.3 From b71cbe74aca99426872447b6dd343a962fe0a528 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 9 Apr 2020 16:25:34 -0500 Subject: Spacing and typo --- main.py | 70 +++++++++++++++++++++++++++++++---------------------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/main.py b/main.py index d0f2793..0d9b37a 100644 --- a/main.py +++ b/main.py @@ -26,7 +26,7 @@ def type_to_heading(type_name): """ Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. """ - + # Remove camel case decamel = re.sub('([A-Z])', r' \1', type_name) # Split @@ -35,12 +35,12 @@ def type_to_heading(type_name): filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] # Reassemble return ' '.join(filtered) - + def name_to_label(field_name): """ Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. """ - + return string.capwords(field_name.replace('_', ' ')) def generate_form(schema): @@ -50,10 +50,10 @@ def generate_form(schema): form section in the template) or an 'id', 'label', 'type', and 'required' (in which case we make a form field in the template). 
""" - + # Get the list of form components, one of which is the root components = schema.get('$graph', []) - + # Find the root root_name = None # And also index components by type name @@ -67,8 +67,8 @@ def generate_form(schema): if component.get('documentRoot', False): # Find whichever one is the root root_name = component_name - - + + def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): """ Do a traversal of the component tree. @@ -76,14 +76,14 @@ def generate_form(schema): Form IDs are .-separated keypaths for where they are in the structure. parent_keys is the path of field names to where we are in the root record's document tree. """ - + if len(parent_keys) > 1: # First make a heading, if we aren't the very root of the form yield {'heading': type_to_heading(type_name)} - + for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): # For each field - + ref_url = None if not isinstance(field_type, str): # If the type isn't a string @@ -91,7 +91,7 @@ def generate_form(schema): ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) # Grab out its type field field_type = field_type.get('type', '') - + # Decide if the field is optional (type ends in ?) optional = False if len(field_type) > 0 and field_type[-1] == '?': @@ -99,7 +99,7 @@ def generate_form(schema): optional = True # Drop the ? field_type = field_type[:-1] - + if field_type in by_name: # This is a subrecord. 
We need to recurse for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): @@ -119,9 +119,9 @@ def generate_form(schema): else: raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) yield record - + return list(walk_fields(root_name)) - + # At startup, we need to load the current metadata schema so we can make a form for it METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) FORM_ITEMS = generate_form(METADATA_SCHEMA) @@ -131,23 +131,23 @@ def send_form(): """ Send the file upload form/front page. """ - + return render_template('form.html', fields=FORM_ITEMS) - + class FileTooBigError(RuntimeError): """ Raised when the user gives a file that is too large. """ pass - + def copy_with_limit(in_file, out_file, limit=1024*1024): """ Copy a file stream, and raise FileTooBigError if the file is too big. """ - + bytes_used = 0 buf_size = 65536 - + buf = in_file.read(buf_size) bytes_used += len(buf) while buf: @@ -156,28 +156,28 @@ def copy_with_limit(in_file, out_file, limit=1024*1024): out_file.write(buf) buf = in_file.read(buf_size) bytes_used += len(buf) - + def parse_input(input_string, html_type): """ Parse an input from the given HTML input type into a useful Python type. - + Raise ValueError if something does not parse. Raise NotImplementedError if we forgot to implement a type. """ - + if html_type == 'text': return input_string elif html_type == 'number': return int(input_string) else: raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) - + @app.route('/submit', methods=['POST']) -def recieve_files(): +def receive_files(): """ - Recieve the uploaded files. + Receive the uploaded files. 
""" - + # We're going to work in one directory per request dest_dir = tempfile.mkdtemp() fasta_dest = os.path.join(dest_dir, 'fasta.fa') @@ -192,7 +192,7 @@ def recieve_files(): except FileTooBigError as e: # Delegate to the 413 error handler return handle_large_file(e) - + if request.form.get('metadata_type', None) == 'upload': if 'metadata' not in request.files: return (render_template('error.html', @@ -206,12 +206,12 @@ def recieve_files(): elif request.form.get('metadata_type', None) == 'fill': # Build a metadata dict metadata = {} - + for item in FORM_ITEMS: # Pull all the field values we wanted from the form if 'heading' in item: continue - + if item['id'] in request.form and len(request.form[item['id']]) > 0: # We have this thing. Make a place in the dict tree for it. parts = item['id'].split('.') @@ -223,7 +223,7 @@ def recieve_files(): if parent not in dest_dict: dest_dict[parent] = {} dest_dict = dest_dict[parent] - + try: # Now finally add the item dest_dict[key] = parse_input(request.form[item['id']], item['type']) @@ -234,18 +234,18 @@ def recieve_files(): elif item['required']: return (render_template('error.html', error_message="You omitted the required metadata item {}".format(item['id'])), 403) - + # Now serialize the file with all the items with open(metadata_dest, 'w') as out_stream: yaml.dump(metadata, out_stream) else: return (render_template('error.html', error_message="You did not include metadata."), 403) - + # Try and upload files to Arvados result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - + if result.returncode != 0: # It didn't work. Complain. error_message="Upload failed. 
Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) @@ -255,7 +255,3 @@ def recieve_files(): return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) finally: shutil.rmtree(dest_dir) - - - - -- cgit v1.2.3 From e110b52bae5a4a62ccc53970a7f26899e9debe7e Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 9 Apr 2020 17:48:15 -0400 Subject: Propagate metadata to pangenome so it can be merged by workflow --- bh20seqanalyzer/main.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 1a8965b..2030c1e 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -110,13 +110,20 @@ def start_pangenome_analysis(api, validated_project): validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) inputobj = { - "inputReads": [] + "inputReads": [], + "metadata": [], + "subjects": [] } for v in validated: inputobj["inputReads"].append({ "class": "File", "location": "keep:%s/sequence.fasta" % v["portable_data_hash"] }) + inputobj["metadata"].append({ + "class": "File", + "location": "keep:%s/metadata.yaml" % v["portable_data_hash"] + }) + inputobj["subjects"].append("keep:%s/sequence.fasta" % v["portable_data_hash"]) run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj) -- cgit v1.2.3 From bf93a6a2fec690eee4bff4891469cd5947102b3a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 9 Apr 2020 17:02:38 -0500 Subject: Moved Guix documentation into separate file (as it confused people ;) --- README.md | 21 +++++---------------- bh20sequploader/main.py | 2 +- doc/INSTALL.md | 31 +++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 doc/INSTALL.md diff --git a/README.md b/README.md index a6fe052..3a8e5f0 100644 --- a/README.md +++ b/README.md @@ -122,19 +122,7 @@ It should print some instructions about 
how to use the uploader. ## Installation with GNU Guix -Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. - -1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: - -```sh -~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs -``` - -2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief: - -```sh -pip3 install --user git+https://github.com/arvados/bh20-seq-resource.git@master -``` +For running/developing the uploader with GNU Guix see [INSTALL.md](./doc/INSTALL.md) # Usage @@ -148,7 +136,7 @@ bh20-seq-uploader example/sequence.fasta example/metadata.json All these uploaded sequences are being fed into a workflow to generate a [pangenome](https://academic.oup.com/bib/article/19/1/118/2566735) for the virus. You can replicate this workflow yourself. -Get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run: +An example is to get your SARS-CoV-2 sequences from GenBank in `seqs.fa`, and then run a series of commands ```sh minimap2 -cx asm20 -X seqs.fa seqs.fa >seqs.paf @@ -157,6 +145,7 @@ odgi build -g seqs.gfa -s -o seqs.odgi odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 ``` -For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). - +Here we convert such a pipeline into the Common Workflow Language (CWL) and +sources can be found [here](https://github.com/hpobio-lab/viral-analysis/tree/master/cwl/pangenome-generate). 
+For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 56cbe22..bf74ea5 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,7 +6,7 @@ import json import urllib.request import socket import getpass -from .qc_metadata import qc_metadata +import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' diff --git a/doc/INSTALL.md b/doc/INSTALL.md new file mode 100644 index 0000000..c5c486c --- /dev/null +++ b/doc/INSTALL.md @@ -0,0 +1,31 @@ +# INSTALLATION + +Other options for running this tool. + +## GNU Guix + +Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. + +1. **Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: + +```sh +~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs +``` + +2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief: + +```sh +pip3 install --user schema-salad arvados-python-client +``` + +Pip installed the following modules + +``` +arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-client-1.6.7 httplib2-0.17.1 oauth2client-4.1.3 pyasn1-0.4.8 pyasn1-modules-0.2.8 rsa-4.0 ruamel.yaml-0.15.77 six-1.14.0 uritemplate-3.0.1 ws4py-0.5.1 +``` + +3. 
Run the tool directly with + +```sh +~/opt/guix/bin/guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- python3 bh20sequploader/main.py +``` -- cgit v1.2.3 From 02615e46e56376302ef99f7223f447a070248214 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 9 Apr 2020 17:11:25 -0500 Subject: Notes --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 0d9b37a..b4e8681 100644 --- a/main.py +++ b/main.py @@ -242,7 +242,7 @@ def receive_files(): return (render_template('error.html', error_message="You did not include metadata."), 403) - # Try and upload files to Arvados + # Try and upload files to Arvados using the sequence uploader CLI result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], stdout=subprocess.PIPE, stderr=subprocess.PIPE) -- cgit v1.2.3 From 7d26be925f37b1f98cac23b018dd1a72fa506a3f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 15:41:29 -0700 Subject: Put back in directory --- bh20simplewebuploader/main.py | 257 +++++++++++++++++++++++++++ bh20simplewebuploader/templates/error.html | 19 ++ bh20simplewebuploader/templates/form.html | 95 ++++++++++ bh20simplewebuploader/templates/success.html | 24 +++ main.py | 257 --------------------------- templates/error.html | 19 -- templates/form.html | 95 ---------- templates/success.html | 24 --- 8 files changed, 395 insertions(+), 395 deletions(-) create mode 100644 bh20simplewebuploader/main.py create mode 100644 bh20simplewebuploader/templates/error.html create mode 100644 bh20simplewebuploader/templates/form.html create mode 100644 bh20simplewebuploader/templates/success.html delete mode 100644 main.py delete mode 100644 templates/error.html delete mode 100644 templates/form.html delete mode 100644 templates/success.html diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py new file mode 100644 index 0000000..b4e8681 --- /dev/null +++ b/bh20simplewebuploader/main.py @@ -0,0 +1,257 @@ 
+import collections +import tempfile +import shutil +import subprocess +import os +import re +import string +import yaml +import urllib.request +from flask import Flask, request, redirect, send_file, send_from_directory, render_template + +app = Flask(__name__, static_url_path='/static', static_folder='static') + +# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. +# We will enforce the limit ourselves and set a higher safety limit here. +app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 + +# When a file is too big we get a 413. +@app.errorhandler(413) +def handle_large_file(e): + return (render_template('error.html', + error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) + + +def type_to_heading(type_name): + """ + Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. + """ + + # Remove camel case + decamel = re.sub('([A-Z])', r' \1', type_name) + # Split + parts = decamel.split() + # Capitalize words and remove unwanted components + filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] + # Reassemble + return ' '.join(filtered) + +def name_to_label(field_name): + """ + Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. + """ + + return string.capwords(field_name.replace('_', ' ')) + +def generate_form(schema): + """ + Linearize the schema and send a bunch of dicts. + Each dict either has a 'heading' (in which case we put a heading for a + form section in the template) or an 'id', 'label', 'type', and 'required' + (in which case we make a form field in the template). 
+ """ + + # Get the list of form components, one of which is the root + components = schema.get('$graph', []) + + # Find the root + root_name = None + # And also index components by type name + by_name = {} + for component in components: + # Get the name of each + component_name = component.get('name', None) + if isinstance(component_name, str): + # And remember how to map back form it + by_name[component_name] = component + if component.get('documentRoot', False): + # Find whichever one is the root + root_name = component_name + + + def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): + """ + Do a traversal of the component tree. + Yield a bunch of form item dicts, in order. + Form IDs are .-separated keypaths for where they are in the structure. + parent_keys is the path of field names to where we are in the root record's document tree. + """ + + if len(parent_keys) > 1: + # First make a heading, if we aren't the very root of the form + yield {'heading': type_to_heading(type_name)} + + for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): + # For each field + + ref_url = None + if not isinstance(field_type, str): + # If the type isn't a string + # See if it has a more info/what goes here URL + ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) + # Grab out its type field + field_type = field_type.get('type', '') + + # Decide if the field is optional (type ends in ?) + optional = False + if len(field_type) > 0 and field_type[-1] == '?': + # It's optional + optional = True + # Drop the ? + field_type = field_type[:-1] + + if field_type in by_name: + # This is a subrecord. 
We need to recurse + for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): + yield item + else: + # We know how to make a string input + record = {} + record['id'] = '.'.join(parent_keys + [field_name]) + record['label'] = name_to_label(field_name) + record['required'] = not optional and not subtree_optional + if ref_url: + record['ref_url'] = ref_url + if field_type == 'string': + record['type'] = 'text' # HTML input type + elif field_type == 'int': + record['type'] = 'number' + else: + raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) + yield record + + return list(walk_fields(root_name)) + +# At startup, we need to load the current metadata schema so we can make a form for it +METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) +FORM_ITEMS = generate_form(METADATA_SCHEMA) + +@app.route('/') +def send_form(): + """ + Send the file upload form/front page. + """ + + return render_template('form.html', fields=FORM_ITEMS) + +class FileTooBigError(RuntimeError): + """ + Raised when the user gives a file that is too large. + """ + pass + +def copy_with_limit(in_file, out_file, limit=1024*1024): + """ + Copy a file stream, and raise FileTooBigError if the file is too big. + """ + + bytes_used = 0 + buf_size = 65536 + + buf = in_file.read(buf_size) + bytes_used += len(buf) + while buf: + if bytes_used > limit: + raise FileTooBigError('Hit file length limit') + out_file.write(buf) + buf = in_file.read(buf_size) + bytes_used += len(buf) + +def parse_input(input_string, html_type): + """ + Parse an input from the given HTML input type into a useful Python type. + + Raise ValueError if something does not parse. + Raise NotImplementedError if we forgot to implement a type. 
+ """ + + if html_type == 'text': + return input_string + elif html_type == 'number': + return int(input_string) + else: + raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) + +@app.route('/submit', methods=['POST']) +def receive_files(): + """ + Receive the uploaded files. + """ + + # We're going to work in one directory per request + dest_dir = tempfile.mkdtemp() + fasta_dest = os.path.join(dest_dir, 'fasta.fa') + metadata_dest = os.path.join(dest_dir, 'metadata.json') + try: + if 'fasta' not in request.files: + return (render_template('error.html', + error_message="You did not include a FASTA file."), 403) + try: + with open(fasta_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('fasta').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + + if request.form.get('metadata_type', None) == 'upload': + if 'metadata' not in request.files: + return (render_template('error.html', + error_message="You did not include a metadata file."), 403) + try: + with open(metadata_dest, 'wb') as out_stream: + copy_with_limit(request.files.get('metadata').stream, out_stream) + except FileTooBigError as e: + # Delegate to the 413 error handler + return handle_large_file(e) + elif request.form.get('metadata_type', None) == 'fill': + # Build a metadata dict + metadata = {} + + for item in FORM_ITEMS: + # Pull all the field values we wanted from the form + if 'heading' in item: + continue + + if item['id'] in request.form and len(request.form[item['id']]) > 0: + # We have this thing. Make a place in the dict tree for it. 
+ parts = item['id'].split('.') + key = parts[-1] + # Remove leading 'metadata' + path = parts[1:-1] + dest_dict = metadata + for parent in path: + if parent not in dest_dict: + dest_dict[parent] = {} + dest_dict = dest_dict[parent] + + try: + # Now finally add the item + dest_dict[key] = parse_input(request.form[item['id']], item['type']) + except ValueError: + # We don't like that input + return (render_template('error.html', + error_message="You provided an unacceptable value for the metadata item {}".format(item['id'])), 403) + elif item['required']: + return (render_template('error.html', + error_message="You omitted the required metadata item {}".format(item['id'])), 403) + + # Now serialize the file with all the items + with open(metadata_dest, 'w') as out_stream: + yaml.dump(metadata, out_stream) + else: + return (render_template('error.html', + error_message="You did not include metadata."), 403) + + # Try and upload files to Arvados using the sequence uploader CLI + result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if result.returncode != 0: + # It didn't work. Complain. + error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) + return (render_template('error.html', error_message=error_message), 403) + else: + # It worked. Say so. + return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) + finally: + shutil.rmtree(dest_dir) diff --git a/bh20simplewebuploader/templates/error.html b/bh20simplewebuploader/templates/error.html new file mode 100644 index 0000000..c2ab0a4 --- /dev/null +++ b/bh20simplewebuploader/templates/error.html @@ -0,0 +1,19 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Upload Failed</title> + </head> + <body> + <h1>Upload Failed</h1> + <hr> + <p> + Your upload has failed. 
{{error_message}} + </p> + <p> + <a href="/">Click here to try again.</a> + </p> + <hr> + </body> +</html> diff --git a/bh20simplewebuploader/templates/form.html b/bh20simplewebuploader/templates/form.html new file mode 100644 index 0000000..4ad41e2 --- /dev/null +++ b/bh20simplewebuploader/templates/form.html @@ -0,0 +1,95 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</title> + </head> + <body> + <h1>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</h1> + <hr> + <p> + This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. Your uploaded sequence will automatically be processed and incorporated into the public pangenome. + </p> + <hr> + <form action="/submit" method="POST" enctype="multipart/form-data" id="main_form"> + <label for="fasta">Select FASTA file for assembled genome (max 1MB):</label> + <br> + <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna" required> + <br> + + <label>Select metadata submission method:</label> + <br> + <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="setMode()" checked required> + <label for="metadata_upload">Upload metadata file</label> + <br> + <input type="radio" id="metadata_form" name="metadata_type" value="fill" onchange="setMode()" required> + <label for="metadata_form">Fill in metadata manually</label> + <br> + + <div id="metadata_upload_form_spot"> + <div id="metadata_upload_form"> + <label for="metadata">Select JSON or YAML metadata file following <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml" target="_blank">this schema</a> (<a 
href="https://github.com/arvados/bh20-seq-resource/blob/master/example/metadata.yaml" target="_blank">Example 1</a>, <a href="https://github.com/arvados/bh20-seq-resource/blob/master/example/minimal_example.yaml" target="_blank">Example 2</a>, max 1MB):</label> + <br> + <input type="file" id="metadata" name="metadata" accept=".json,.yml,.yaml" required> + <br> + </div> + </div> + + <div id="metadata_fill_form_spot"> + <div id="metadata_fill_form"> + {% for record in fields %} + {% if 'heading' in record %} + <h4>{{ record['heading'] }}</h4> + {% else %} + <label for="{{ record['id'] }}"> + {{ record['label'] }} + {{ "*" if record['required'] else "" }} + {% if 'ref_url' in record %} + <a href="{{ record['ref_url'] }}" title="More Info" target="_blank">?</a> + {% endif %} + </label> + <br> + <input type="{{ record['type'] }}" id="{{ record['id'] }}" name="{{ record['id'] }}" {{ "required" if record['required'] else "" }}> + <br> + {% endif %} + {% endfor %} + </div> + </div> + + <input type="submit" value="Add to Pangenome"> + </form> + <hr> + <small><a href="https://github.com/adamnovak/bh20-simple-web-uploader">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> + <script type="text/javascript"> + let uploadForm = document.getElementById('metadata_upload_form') + let uploadFormSpot = document.getElementById('metadata_upload_form_spot') + let fillForm = document.getElementById('metadata_fill_form') + let fillFormSpot = document.getElementById('metadata_fill_form_spot') + + function setUploadMode() { + // Make the upload form the one in use + uploadFormSpot.appendChild(uploadForm) + fillFormSpot.removeChild(fillForm) + } + + function setFillMode() { + // Make the fillable form the one in use + uploadFormSpot.removeChild(uploadForm) + fillFormSpot.appendChild(fillForm) + } + + function setMode() { + // Pick mode based on radio + if (document.getElementById('metadata_upload').checked) { + setUploadMode() + } 
else { + setFillMode() + } + } + + // Start in mode appropriate to selected form item + setMode() + </script> + </body> +</html> diff --git a/bh20simplewebuploader/templates/success.html b/bh20simplewebuploader/templates/success.html new file mode 100644 index 0000000..1be7861 --- /dev/null +++ b/bh20simplewebuploader/templates/success.html @@ -0,0 +1,24 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + <title>Upload Successful</title> + </head> + <body> + <h1>Upload Successful</h1> + <hr> + <p> + Your files have been uploaded. They should soon appear as part of the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. + </p> + <p> + The upload log was: + </p> + <pre>{{log}}</pre> + <hr> + <p> + <a href="/">Click here to upload more files.</a> + </p> + <hr> + </body> +</html> diff --git a/main.py b/main.py deleted file mode 100644 index b4e8681..0000000 --- a/main.py +++ /dev/null @@ -1,257 +0,0 @@ -import collections -import tempfile -import shutil -import subprocess -import os -import re -import string -import yaml -import urllib.request -from flask import Flask, request, redirect, send_file, send_from_directory, render_template - -app = Flask(__name__, static_url_path='/static', static_folder='static') - -# Limit file upload size. We shouldn't be working with anything over 1 MB; these are small genomes. -# We will enforce the limit ourselves and set a higher safety limit here. -app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 - -# When a file is too big we get a 413. -@app.errorhandler(413) -def handle_large_file(e): - return (render_template('error.html', - error_message="One of your files is too large. The maximum file size is 1 megabyte."), 413) - - -def type_to_heading(type_name): - """ - Turn a type name like "sampleSchema" from the metadata schema into a human-readable heading. 
- """ - - # Remove camel case - decamel = re.sub('([A-Z])', r' \1', type_name) - # Split - parts = decamel.split() - # Capitalize words and remove unwanted components - filtered = [part.capitalize() for part in parts if (part.lower() != 'schema' and part != '')] - # Reassemble - return ' '.join(filtered) - -def name_to_label(field_name): - """ - Turn a filed name like "host_health_status" from the metadata schema into a human-readable label. - """ - - return string.capwords(field_name.replace('_', ' ')) - -def generate_form(schema): - """ - Linearize the schema and send a bunch of dicts. - Each dict either has a 'heading' (in which case we put a heading for a - form section in the template) or an 'id', 'label', 'type', and 'required' - (in which case we make a form field in the template). - """ - - # Get the list of form components, one of which is the root - components = schema.get('$graph', []) - - # Find the root - root_name = None - # And also index components by type name - by_name = {} - for component in components: - # Get the name of each - component_name = component.get('name', None) - if isinstance(component_name, str): - # And remember how to map back form it - by_name[component_name] = component - if component.get('documentRoot', False): - # Find whichever one is the root - root_name = component_name - - - def walk_fields(type_name, parent_keys=['metadata'], subtree_optional=False): - """ - Do a traversal of the component tree. - Yield a bunch of form item dicts, in order. - Form IDs are .-separated keypaths for where they are in the structure. - parent_keys is the path of field names to where we are in the root record's document tree. 
- """ - - if len(parent_keys) > 1: - # First make a heading, if we aren't the very root of the form - yield {'heading': type_to_heading(type_name)} - - for field_name, field_type in by_name.get(type_name, {}).get('fields', {}).items(): - # For each field - - ref_url = None - if not isinstance(field_type, str): - # If the type isn't a string - # See if it has a more info/what goes here URL - ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) - # Grab out its type field - field_type = field_type.get('type', '') - - # Decide if the field is optional (type ends in ?) - optional = False - if len(field_type) > 0 and field_type[-1] == '?': - # It's optional - optional = True - # Drop the ? - field_type = field_type[:-1] - - if field_type in by_name: - # This is a subrecord. We need to recurse - for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): - yield item - else: - # We know how to make a string input - record = {} - record['id'] = '.'.join(parent_keys + [field_name]) - record['label'] = name_to_label(field_name) - record['required'] = not optional and not subtree_optional - if ref_url: - record['ref_url'] = ref_url - if field_type == 'string': - record['type'] = 'text' # HTML input type - elif field_type == 'int': - record['type'] = 'number' - else: - raise NotImplementedError('Unimplemented field type {} in {} in metadata schema'.format(field_type, type_name)) - yield record - - return list(walk_fields(root_name)) - -# At startup, we need to load the current metadata schema so we can make a form for it -METADATA_SCHEMA = yaml.safe_load(urllib.request.urlopen('https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml')) -FORM_ITEMS = generate_form(METADATA_SCHEMA) - -@app.route('/') -def send_form(): - """ - Send the file upload form/front page. 
- """ - - return render_template('form.html', fields=FORM_ITEMS) - -class FileTooBigError(RuntimeError): - """ - Raised when the user gives a file that is too large. - """ - pass - -def copy_with_limit(in_file, out_file, limit=1024*1024): - """ - Copy a file stream, and raise FileTooBigError if the file is too big. - """ - - bytes_used = 0 - buf_size = 65536 - - buf = in_file.read(buf_size) - bytes_used += len(buf) - while buf: - if bytes_used > limit: - raise FileTooBigError('Hit file length limit') - out_file.write(buf) - buf = in_file.read(buf_size) - bytes_used += len(buf) - -def parse_input(input_string, html_type): - """ - Parse an input from the given HTML input type into a useful Python type. - - Raise ValueError if something does not parse. - Raise NotImplementedError if we forgot to implement a type. - """ - - if html_type == 'text': - return input_string - elif html_type == 'number': - return int(input_string) - else: - raise NotImplementedError('Unimplemented input type: {}'.format(html_type)) - -@app.route('/submit', methods=['POST']) -def receive_files(): - """ - Receive the uploaded files. 
- """ - - # We're going to work in one directory per request - dest_dir = tempfile.mkdtemp() - fasta_dest = os.path.join(dest_dir, 'fasta.fa') - metadata_dest = os.path.join(dest_dir, 'metadata.json') - try: - if 'fasta' not in request.files: - return (render_template('error.html', - error_message="You did not include a FASTA file."), 403) - try: - with open(fasta_dest, 'wb') as out_stream: - copy_with_limit(request.files.get('fasta').stream, out_stream) - except FileTooBigError as e: - # Delegate to the 413 error handler - return handle_large_file(e) - - if request.form.get('metadata_type', None) == 'upload': - if 'metadata' not in request.files: - return (render_template('error.html', - error_message="You did not include a metadata file."), 403) - try: - with open(metadata_dest, 'wb') as out_stream: - copy_with_limit(request.files.get('metadata').stream, out_stream) - except FileTooBigError as e: - # Delegate to the 413 error handler - return handle_large_file(e) - elif request.form.get('metadata_type', None) == 'fill': - # Build a metadata dict - metadata = {} - - for item in FORM_ITEMS: - # Pull all the field values we wanted from the form - if 'heading' in item: - continue - - if item['id'] in request.form and len(request.form[item['id']]) > 0: - # We have this thing. Make a place in the dict tree for it. 
- parts = item['id'].split('.') - key = parts[-1] - # Remove leading 'metadata' - path = parts[1:-1] - dest_dict = metadata - for parent in path: - if parent not in dest_dict: - dest_dict[parent] = {} - dest_dict = dest_dict[parent] - - try: - # Now finally add the item - dest_dict[key] = parse_input(request.form[item['id']], item['type']) - except ValueError: - # We don't like that input - return (render_template('error.html', - error_message="You provided an unacceptable value for the metadata item {}".format(item['id'])), 403) - elif item['required']: - return (render_template('error.html', - error_message="You omitted the required metadata item {}".format(item['id'])), 403) - - # Now serialize the file with all the items - with open(metadata_dest, 'w') as out_stream: - yaml.dump(metadata, out_stream) - else: - return (render_template('error.html', - error_message="You did not include metadata."), 403) - - # Try and upload files to Arvados using the sequence uploader CLI - result = subprocess.run(['bh20-seq-uploader', fasta_dest, metadata_dest], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - if result.returncode != 0: - # It didn't work. Complain. - error_message="Upload failed. Uploader returned {} and said:\n{}".format(result.returncode, result.stderr) - return (render_template('error.html', error_message=error_message), 403) - else: - # It worked. Say so. - return render_template('success.html', log=result.stdout.decode('utf-8', errors='replace')) - finally: - shutil.rmtree(dest_dir) diff --git a/templates/error.html b/templates/error.html deleted file mode 100644 index c2ab0a4..0000000 --- a/templates/error.html +++ /dev/null @@ -1,19 +0,0 @@ -<!DOCTYPE html> -<html> - <head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Upload Failed</title> - </head> - <body> - <h1>Upload Failed</h1> - <hr> - <p> - Your upload has failed. 
{{error_message}} - </p> - <p> - <a href="/">Click here to try again.</a> - </p> - <hr> - </body> -</html> diff --git a/templates/form.html b/templates/form.html deleted file mode 100644 index 4ad41e2..0000000 --- a/templates/form.html +++ /dev/null @@ -1,95 +0,0 @@ -<!DOCTYPE html> -<html> - <head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</title> - </head> - <body> - <h1>Simple Web Uploader for Public SARS-CoV-2 Sequence Resource</h1> - <hr> - <p> - This tool can be used to upload sequenced genomes of SARS-CoV-2 samples to the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. Your uploaded sequence will automatically be processed and incorporated into the public pangenome. - </p> - <hr> - <form action="/submit" method="POST" enctype="multipart/form-data" id="main_form"> - <label for="fasta">Select FASTA file for assembled genome (max 1MB):</label> - <br> - <input type="file" id="fasta" name="fasta" accept=".fa,.fasta,.fna" required> - <br> - - <label>Select metadata submission method:</label> - <br> - <input type="radio" id="metadata_upload" name="metadata_type" value="upload" onchange="setMode()" checked required> - <label for="metadata_upload">Upload metadata file</label> - <br> - <input type="radio" id="metadata_form" name="metadata_type" value="fill" onchange="setMode()" required> - <label for="metadata_form">Fill in metadata manually</label> - <br> - - <div id="metadata_upload_form_spot"> - <div id="metadata_upload_form"> - <label for="metadata">Select JSON or YAML metadata file following <a href="https://github.com/arvados/bh20-seq-resource/blob/master/bh20sequploader/bh20seq-schema.yml" target="_blank">this schema</a> (<a href="https://github.com/arvados/bh20-seq-resource/blob/master/example/metadata.yaml" target="_blank">Example 1</a>, <a 
href="https://github.com/arvados/bh20-seq-resource/blob/master/example/minimal_example.yaml" target="_blank">Example 2</a>, max 1MB):</label> - <br> - <input type="file" id="metadata" name="metadata" accept=".json,.yml,.yaml" required> - <br> - </div> - </div> - - <div id="metadata_fill_form_spot"> - <div id="metadata_fill_form"> - {% for record in fields %} - {% if 'heading' in record %} - <h4>{{ record['heading'] }}</h4> - {% else %} - <label for="{{ record['id'] }}"> - {{ record['label'] }} - {{ "*" if record['required'] else "" }} - {% if 'ref_url' in record %} - <a href="{{ record['ref_url'] }}" title="More Info" target="_blank">?</a> - {% endif %} - </label> - <br> - <input type="{{ record['type'] }}" id="{{ record['id'] }}" name="{{ record['id'] }}" {{ "required" if record['required'] else "" }}> - <br> - {% endif %} - {% endfor %} - </div> - </div> - - <input type="submit" value="Add to Pangenome"> - </form> - <hr> - <small><a href="https://github.com/adamnovak/bh20-simple-web-uploader">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> - <script type="text/javascript"> - let uploadForm = document.getElementById('metadata_upload_form') - let uploadFormSpot = document.getElementById('metadata_upload_form_spot') - let fillForm = document.getElementById('metadata_fill_form') - let fillFormSpot = document.getElementById('metadata_fill_form_spot') - - function setUploadMode() { - // Make the upload form the one in use - uploadFormSpot.appendChild(uploadForm) - fillFormSpot.removeChild(fillForm) - } - - function setFillMode() { - // Make the fillable form the one in use - uploadFormSpot.removeChild(uploadForm) - fillFormSpot.appendChild(fillForm) - } - - function setMode() { - // Pick mode based on radio - if (document.getElementById('metadata_upload').checked) { - setUploadMode() - } else { - setFillMode() - } - } - - // Start in mode appropriate to selected form item - setMode() - </script> - </body> 
-</html> diff --git a/templates/success.html b/templates/success.html deleted file mode 100644 index 1be7861..0000000 --- a/templates/success.html +++ /dev/null @@ -1,24 +0,0 @@ -<!DOCTYPE html> -<html> - <head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <title>Upload Successful</title> - </head> - <body> - <h1>Upload Successful</h1> - <hr> - <p> - Your files have been uploaded. They should soon appear as part of the <a href="https://workbench.lugli.arvadosapi.com/collections/lugli-4zz18-z513nlpqm03hpca">Public SARS-CoV-2 Sequence Resource</a>. - </p> - <p> - The upload log was: - </p> - <pre>{{log}}</pre> - <hr> - <p> - <a href="/">Click here to upload more files.</a> - </p> - <hr> - </body> -</html> -- cgit v1.2.3 From 2cd6623aa0ddfe4e42b2d434e0523773bb3536ef Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 15:52:23 -0700 Subject: Copy over/combine top-level project components --- Dockerfile | 19 +++++++++++++++++++ README.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ bh20sequploader/main.py | 7 ++----- setup.py | 6 +++++- 4 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..43fa8f2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +# Dockerfile for containerizing the web interface +FROM python:3.6-jessie +WORKDIR /app + +RUN pip3 install gunicorn + +ADD LICENSE /app/ +ADD gittaggers.py /app/ +ADD setup.py /app/ +ADD README.md /app/ +ADD example /app/example +ADD bh20seqanalyzer /app/bh20simplewebuploader +ADD bh20sequploader /app/bh20sequploader +ADD bh20simplewebuploader /app/bh20simplewebuploader + +RUN pip3 install -e . 
+ +ENV PORT 8080 +CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:8080", "bh20simplewebuploader.main:app"] diff --git a/README.md b/README.md index a6fe052..4667310 100644 --- a/README.md +++ b/README.md @@ -159,4 +159,49 @@ odgi viz -i seqs.odgi -o seqs.png -x 4000 -y 500 -R -P 5 For more information on building pangenome models, [see this wiki page](https://github.com/virtual-biohackathons/covid-19-bh20/wiki/Pangenome#pangenome-model-from-available-genomes). +# Web Interface + +This project comes with a simple web server that lets you use the sequence uploader from a browser. It will work as long as you install the packager with the `web` extra. + +To run it locally: + +``` +virtualenv --python python3 venv +. venv/bin/activate +pip install -e .[web] +env FLASK_APP=bh20simplewebuploader/main.py flask run +``` + +Then visit [http://127.0.0.1:5000/](http://127.0.0.1:5000/). + +## Production + +For production deployment, you can use [gunicorn](https://flask.palletsprojects.com/en/1.1.x/deploying/wsgi-standalone/#gunicorn): + +``` +pip3 install gunicorn +gunicorn bh20simplewebuploader.main:app +``` + +This runs on [http://127.0.0.1:8000/](http://127.0.0.1:8000/) by default, but can be adjusted with various [gunicorn options](http://docs.gunicorn.org/en/latest/run.html#commonly-used-arguments) + +## GNU Guix + +To run the web uploader in a GNU Guix environment + +``` +guix environment guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run +``` + +The containerized version looks like + +``` +guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl +``` + +and + +``` +env FLASK_APP=bh20simplewebuploader/main.py flask run +``` diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 8b8fefe..d3ebc0c 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,7 +6,6 @@ import json import urllib.request import socket import 
getpass -from .qc_metadata import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -20,8 +19,6 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - qc_metadata(args.metadata.name) - col = arvados.collection.Collection(api_client=api) print("Reading FASTA") @@ -32,8 +29,8 @@ def main(): f.write(r) r = args.sequence.read(65536) - print("Reading metadata") - with col.open("metadata.yaml", "w") as f: + print("Reading JSONLD") + with col.open("metadata.jsonld", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: diff --git a/setup.py b/setup.py index 48c25aa..41ace7b 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ except ImportError: tagger = egg_info_cmd.egg_info install_requires = ["arvados-python-client", "schema-salad"] +web_requires = ["flask", "pyyaml"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] @@ -29,9 +30,12 @@ setup( author="Peter Amstutz", author_email="peter.amstutz@curii.com", license="Apache 2.0", - packages=["bh20sequploader", "bh20seqanalyzer"], + packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"], package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, install_requires=install_requires, + extras_require={ + 'web': web_requires + }, setup_requires=[] + pytest_runner, tests_require=["pytest<5"], entry_points={ -- cgit v1.2.3 From 278ff0b42a49b861060eae0c7eb6112e9658fa4e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 16:08:23 -0700 Subject: Point to new repo for source --- bh20simplewebuploader/templates/form.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20simplewebuploader/templates/form.html b/bh20simplewebuploader/templates/form.html index 4ad41e2..2934a7c 100644 --- a/bh20simplewebuploader/templates/form.html +++ 
b/bh20simplewebuploader/templates/form.html @@ -60,7 +60,7 @@ <input type="submit" value="Add to Pangenome"> </form> <hr> - <small><a href="https://github.com/adamnovak/bh20-simple-web-uploader">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> + <small><a href="https://github.com/arvados/bh20-seq-resource">Source</a> · Made for <a href="https://github.com/virtual-biohackathons/covid-19-bh20">COVID-19-BH20</a></small> <script type="text/javascript"> let uploadForm = document.getElementById('metadata_upload_form') let uploadFormSpot = document.getElementById('metadata_upload_form_spot') -- cgit v1.2.3 From d53e1e98b800d7dc5720de0b3c14c94452159315 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 9 Apr 2020 16:11:03 -0700 Subject: Move the web uploader GUIX instructions to the GUIX file --- README.md | 18 ------------------ doc/INSTALL.md | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 960472e..d83eaac 100644 --- a/README.md +++ b/README.md @@ -176,23 +176,5 @@ gunicorn bh20simplewebuploader.main:app This runs on [http://127.0.0.1:8000/](http://127.0.0.1:8000/) by default, but can be adjusted with various [gunicorn options](http://docs.gunicorn.org/en/latest/run.html#commonly-used-arguments) -## GNU Guix -To run the web uploader in a GNU Guix environment - -``` -guix environment guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run -``` - -The containerized version looks like - -``` -guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -``` - -and - -``` -env FLASK_APP=bh20simplewebuploader/main.py flask run -``` diff --git a/doc/INSTALL.md b/doc/INSTALL.md index c5c486c..f7fd811 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -29,3 +29,23 @@ arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 
google-api-python-clien ```sh ~/opt/guix/bin/guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- python3 bh20sequploader/main.py ``` + +### Using the Web Uploader + +To run the web uploader in a GNU Guix environment + +``` +guix environment guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run +``` + +The containerized version looks like + +``` +guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl +``` + +and + +``` +env FLASK_APP=bh20simplewebuploader/main.py flask run +``` -- cgit v1.2.3 From ddabd9390d2b221786ef58a6d85200eecf82ca2f Mon Sep 17 00:00:00 2001 From: lltommy Date: Fri, 10 Apr 2020 15:39:56 +0200 Subject: Updating schema and examples, including URIs --- bh20sequploader/bh20seq-schema.yml | 145 ++++++++++++++++++++++++++++++------- example/metadata.yaml | 4 + example/minimal_example.yaml | 6 +- 3 files changed, 128 insertions(+), 27 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 5c962d1..cf9b015 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -3,6 +3,10 @@ $namespaces: sch: https://schema.org/ efo: http://www.ebi.ac.uk/efo/ obo: http://purl.obolibrary.org/obo/ + sio: http://semanticscience.org/resource/ + edam: http://edamontology.org/ + evs: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# + $graph: - name: hostSchema @@ -12,37 +16,93 @@ $graph: type: string jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0000532 - host_id: string - host_common_name: string? - host_sex: string? - host_age: int? - host_age_unit: string? - host_health_status: string? + host_id: + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + host_common_name: + type: string? 
+ jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NOMEN_0000037 + host_sex: + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000047 + host_age: + type: int? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000011 + host_age_unit: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/UO_0000036 + host_health_status: + type: string? + jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688 host_treatment: type: string? jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0000727 - additional_host_information: string? + host_vaccination: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/VO_0000001 + additional_host_information: + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 - name: sampleSchema type: record fields: - collector_name: string - collecting_institution: string - specimen_source: string? - collection_date: string? + collector_name: + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001895 + collecting_institution: + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + specimen_source: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001479 + collection_date: + type: string? + jsonldPredicate: + _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164 collection_location: type: string? jsonldPredicate: - _id: https://schema.org/fromLocation - sample_storage_conditions: string? - additional_collection_information: string? + _id: http://purl.obolibrary.org/obo/GAZ_00000448 + sample_storage_conditions: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001472 + additional_collection_information: + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + sample_id: + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + source_database_accession: + type: string? + jsonldPredicate: + _id: http://edamontology.org/data_2091 - name: virusSchema type: record fields: - virus_species: string? - virus_strain: string? + virus_species: + type: string? + jsonldPredicate: + _id: http://edamontology.org/data_1875 + virus_strain: + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_010055 - name: technologySchema type: record @@ -50,7 +110,7 @@ $graph: sample_sequencing_technology: type: string jsonldPredicate: - _id: http://www.ebi.ac.uk/efo/EFO_0000532 + _id: http://purl.obolibrary.org/obo/OBI_0600047 sequence_assembly_method: type: string? jsonldPredicate: @@ -63,14 +123,42 @@ $graph: - name: submitterSchema type: record fields: - submitter_name: string - submitter_address: string? - originating_lab: string - lab_address: string? - provider_sample_id: string? - submitter_sample_id: string? - authors: string? - submitter_id: string? + submitter_name: + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000116 + submitter_date: + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C94162 + submitter_address: + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000172 + originating_lab: + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37984 + lab_address: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0600047 + provider_sample_id: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37900 + submitter_sample_id: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0001741 + authors: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C42781 + submitter_id: + type: string? 
+ jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 - name: MainSchema type: record @@ -81,9 +169,14 @@ $graph: virus: virusSchema? technology: technologySchema submitter: submitterSchema + submission: + type: string + jsonldPredicate: + _id: "@id" + #_type: "@id" sequencefile: doc: The subject (eg the fasta/fastq file) that this metadata describes type: string? jsonldPredicate: _id: "@id" - _type: "@id" + _type: "@id" \ No newline at end of file diff --git a/example/metadata.yaml b/example/metadata.yaml index 41ff93e..a2f6e57 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -1,3 +1,5 @@ +submission: publicSequenceResource + host: host_id: XX1 host_species: string @@ -10,6 +12,7 @@ host: additional_host_information: string sample: + sample_id: XXX collector_name: XXX collecting_institution: XXX specimen_source: XXX @@ -36,3 +39,4 @@ submitter: submitter_sample_id: string authors: testAuthor submitter_id: X12 + submitter_date: Subdate diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml index 201b080..f312ab7 100644 --- a/example/minimal_example.yaml +++ b/example/minimal_example.yaml @@ -1,8 +1,11 @@ +submission: publicSequenceResource + host: host_id: XX host_species: string sample: + sample_id: XXX collector_name: XXX collecting_institution: XXX @@ -11,4 +14,5 @@ technology: submitter: submitter_name: tester - originating_lab: testLab \ No newline at end of file + originating_lab: testLab + submitter_date: Subdate \ No newline at end of file -- cgit v1.2.3 From bef2a43185f9494398f5d5a8cdb6c5f34352f912 Mon Sep 17 00:00:00 2001 From: Alex Kanitz Date: Fri, 10 Apr 2020 18:27:44 +0200 Subject: validate seq format with magic file --- bh20sequploader/main.py | 29 +++++++++++++++++++++++++---- bh20sequploader/validation/Makefile | 4 ++++ bh20sequploader/validation/formats | 4 ++++ bh20sequploader/validation/formats.mgc | Bin 0 -> 1032 bytes 4 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 
bh20sequploader/validation/Makefile create mode 100644 bh20sequploader/validation/formats create mode 100644 bh20sequploader/validation/formats.mgc diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index bf74ea5..1d5b9c3 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -3,6 +3,8 @@ import time import arvados import arvados.collection import json +import magic +from pathlib import Path import urllib.request import socket import getpass @@ -14,7 +16,7 @@ UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa' def main(): parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis') - parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA') + parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ') parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json') args = parser.parse_args() @@ -26,10 +28,27 @@ def main(): col = arvados.collection.Collection(api_client=api) - if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"): - target = "sequence.fasta" - elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"): + magic_file = Path(__file__).parent / "validation" / "formats.mgc" + val = magic.Magic(magic_file=magic_file.resolve().as_posix(), + uncompress=False, mime=True) + seq_type = val.from_file(args.sequence.name).lower() + print(f"Sequence type: {seq_type}") + if seq_type == "text/fasta": + # ensure that contains only one entry + entries = 0 + for line in args.sequence: + if line.startswith(">"): + entries += 1 + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + args.sequence.close() + args.sequence = open(args.sequence.name, "r") target = "reads.fastq" + elif seq_type == "text/fastq": + target = "sequence.fasta" + else: + raise ValueError("Sequence file does not look like FASTA or FASTQ") with col.open(target, "w") as f: r = 
args.sequence.read(65536) @@ -37,6 +56,7 @@ def main(): while r: f.write(r) r = args.sequence.read(65536) + args.sequence.close() print("Reading metadata") with col.open("metadata.yaml", "w") as f: @@ -45,6 +65,7 @@ def main(): while r: f.write(r) r = args.metadata.read(65536) + args.metadata.close() external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') diff --git a/bh20sequploader/validation/Makefile b/bh20sequploader/validation/Makefile new file mode 100644 index 0000000..1ca13fb --- /dev/null +++ b/bh20sequploader/validation/Makefile @@ -0,0 +1,4 @@ +compile: formats.mgc + +formats.mgc : + file -C -m formats diff --git a/bh20sequploader/validation/formats b/bh20sequploader/validation/formats new file mode 100644 index 0000000..ac804cf --- /dev/null +++ b/bh20sequploader/validation/formats @@ -0,0 +1,4 @@ +0 regex \^\>.+\r?\n([acgtnACGTN]+\r?\n)*[acgtnACGTN]+(\r?\n)?$ FASTA +!:mime text/fasta +0 regex \^@.+\r?\n[acgtnACGTN]*\n\\+.*\n[!-i]*(\r\n)? FASTQ +!:mime text/fastq \ No newline at end of file diff --git a/bh20sequploader/validation/formats.mgc b/bh20sequploader/validation/formats.mgc new file mode 100644 index 0000000..bff282a Binary files /dev/null and b/bh20sequploader/validation/formats.mgc differ -- cgit v1.2.3 From 42bef034915c84d3e3ee71d018a37f4292da5504 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 10 Apr 2020 11:48:24 -0500 Subject: Cleaned up Guix install instructions --- doc/INSTALL.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/doc/INSTALL.md b/doc/INSTALL.md index f7fd811..5e9e7e9 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -4,12 +4,14 @@ Other options for running this tool. ## GNU Guix -Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you even when you don't have root access on an Ubuntu system. +### Running the CLI uploader -1. 
**Set up and enter a container with the necessary dependencies.** After installing Guix as `~/opt/guix/bin/guix`, run: +Another way to install this tool is inside a [GNU Guix Environment](https://guix.gnu.org/manual/en/html_node/Invoking-guix-environment.html), which can handle installing dependencies for you + +1. **Set up and enter a Guix environment with the necessary dependencies.** After installing Guix run: ```sh -~/opt/guix/bin/guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs +guix environment -C guix --ad-hoc git python openssl python-pycurl nss-certs ``` 2. **Install the tool.** From there you can follow the [user installation instructions](#installation-with-pip3---user). In brief: @@ -27,7 +29,7 @@ arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-clien 3. Run the tool directly with ```sh -~/opt/guix/bin/guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- python3 bh20sequploader/main.py +guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- python3 bh20sequploader/main.py ``` ### Using the Web Uploader @@ -49,3 +51,5 @@ and ``` env FLASK_APP=bh20simplewebuploader/main.py flask run ``` + +WIP: add gunicorn container -- cgit v1.2.3 From 9b7fbc52ae229d72e75de9f433eea00ce37ba70a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 10 Apr 2020 11:52:18 -0500 Subject: Fixed TypeError: 'module' object is not callable --- bh20sequploader/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index bf74ea5..ede9f38 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,7 +6,7 @@ import json import urllib.request import socket import getpass -import qc_metadata +from qc_metadata import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' -- cgit v1.2.3 From 686bb7e2b3e3bdb326553f2094a07d4f987e303b Mon Sep 17 
00:00:00 2001 From: Pjotr Prins Date: Fri, 10 Apr 2020 12:09:57 -0500 Subject: This fixed {'type': 'string?', 'jsonldPredicate': {'_id': 'http://purl.obolibrary.org/obo/UO_0000036'}} {'type': 'string?', 'jsonldPredicate': 'http://purl.obolibrary.org/obo/NCIT_C25688'} Traceback (most recent call last): (...) File "/export/iwrk/opensource/code/vg/bh20-seq-resource/bh20simplewebuploader/main.py", line 106, in walk_fields for item in walk_fields(field_type, parent_keys + [field_name], subtree_optional or optional): File "/export/iwrk/opensource/code/vg/bh20-seq-resource/bh20simplewebuploader/main.py", line 92, in walk_fields ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) AttributeError: 'str' object has no attribute 'get' --- bh20simplewebuploader/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bh20simplewebuploader/main.py b/bh20simplewebuploader/main.py index b4e8681..bfc7762 100644 --- a/bh20simplewebuploader/main.py +++ b/bh20simplewebuploader/main.py @@ -88,7 +88,11 @@ def generate_form(schema): if not isinstance(field_type, str): # If the type isn't a string # See if it has a more info/what goes here URL - ref_url = field_type.get('jsonldPredicate', {}).get('_id', None) + predicate = field_type.get('jsonldPredicate', {}) + if not isinstance(predicate, str): + ref_url = predicate.get('_id', None) + else: + ref_url = predicate # not sure this is correct # Grab out its type field field_type = field_type.get('type', '') -- cgit v1.2.3 From f27a6a3e3be2446660e2f59c1106dcaba32971cf Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 10 Apr 2020 12:13:00 -0500 Subject: Note on running container with Guix --- doc/INSTALL.md | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/doc/INSTALL.md b/doc/INSTALL.md index 5e9e7e9..bde9dd2 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -34,22 +34,16 @@ guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- pyt ### 
Using the Web Uploader -To run the web uploader in a GNU Guix environment +To run the web uploader in a GNU Guix environment/container ``` -guix environment guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run -``` - -The containerized version looks like - -``` -guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -``` - -and - -``` -env FLASK_APP=bh20simplewebuploader/main.py flask run +guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run + * Serving Flask app "bh20simplewebuploader/main.py" + * Environment: production + WARNING: This is a development server. Do not use it in a production deployment. + Use a production WSGI server instead. + * Debug mode: off + * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit) ``` WIP: add gunicorn container -- cgit v1.2.3 From 9e44ae814862ea91456d95b31981c16ecae6d317 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 10 Apr 2020 12:34:42 -0500 Subject: Magic dependency --- doc/INSTALL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/INSTALL.md b/doc/INSTALL.md index bde9dd2..d8d7f3e 100644 --- a/doc/INSTALL.md +++ b/doc/INSTALL.md @@ -29,7 +29,7 @@ arvados-python-client-2.0.1 ciso8601-2.1.3 future-0.18.2 google-api-python-clien 3. 
Run the tool directly with ```sh -guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- python3 bh20sequploader/main.py +guix environment guix --ad-hoc git python openssl python-pycurl python-magic nss-certs -- python3 bh20sequploader/main.py example/sequence.fasta example/metadata.yaml ``` ### Using the Web Uploader @@ -37,7 +37,7 @@ guix environment guix --ad-hoc git python openssl python-pycurl nss-certs -- pyt To run the web uploader in a GNU Guix environment/container ``` -guix environment -C guix --ad-hoc git python python-flask python-pyyaml nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run +guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-magic nss-certs --network openssl -- env FLASK_APP=bh20simplewebuploader/main.py flask run * Serving Flask app "bh20simplewebuploader/main.py" * Environment: production WARNING: This is a development server. Do not use it in a production deployment. -- cgit v1.2.3 From a6ba9a5203a568611a94c043fd13e2ec50f071da Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 15:12:08 -0400 Subject: Validate & propagate metadata --- bh20seqanalyzer/main.py | 45 ++++++++++++++++++++++++++++++++------ bh20sequploader/bh20seq-schema.yml | 6 ++--- bh20sequploader/main.py | 2 +- bh20sequploader/qc_metadata.py | 29 +++++++++++++++++++++++- 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 2030c1e..1fb51b5 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -8,6 +8,8 @@ import json import logging import ruamel.yaml from bh20sequploader.qc_metadata import qc_metadata +import pkg_resources +from schema_salad.sourceline import add_lc_filename logging.basicConfig(format="[%(asctime)s] %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) @@ -24,8 +26,14 @@ def validate_upload(api, collection, validated_project, logging.warn("Upload '%s' 
missing metadata.yaml", collection["name"]) valid = False else: - metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) - #valid = qc_metadata(metadata_content) and valid + try: + metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) + metadata_content["id"] = "keep:%s/metadata.yaml" % collection["portable_data_hash"] + add_lc_filename(metadata_content, metadata_content["id"]) + valid = qc_metadata(metadata_content) and valid + except Exception as e: + logging.warn(e) + valid = False if not valid: logging.warn("Failed metadata qc") @@ -52,9 +60,10 @@ def validate_upload(api, collection, validated_project, "owner_uuid": validated_project, "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute() else: + pass # It is invalid, delete it. - logging.warn("Deleting '%s'" % collection["name"]) - api.collections().delete(uuid=collection["uuid"]).execute() + #logging.warn("Deleting '%s'" % collection["name"]) + #api.collections().delete(uuid=collection["uuid"]).execute() return valid @@ -107,12 +116,17 @@ def start_fastq_to_fasta(api, collection, def start_pangenome_analysis(api, analysis_project, pangenome_workflow_uuid, - validated_project): + validated_project, + schema_ref): validated = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", validated_project]]) inputobj = { "inputReads": [], "metadata": [], - "subjects": [] + "subjects": [], + "metadataSchema": { + "class": "File", + "location": schema_ref + } } for v in validated: inputobj["inputReads"].append({ @@ -166,12 +180,26 @@ def move_fastq_to_fasta_results(api, analysis_project, uploader_project): api.groups().update(uuid=p["uuid"], body={"properties": p["properties"]}).execute() +def upload_schema(api, workflow_def_project): + schema_resource = pkg_resources.resource_stream('bh20sequploader.qc_metadata', "bh20seq-schema.yml") + c = arvados.collection.Collection() + with c.open("schema.yml", "wb") as f: + 
f.write(schema_resource.read()) + pdh = c.portable_data_hash() + wd = api.collections().list(filters=[["owner_uuid", "=", workflow_def_project], + ["portable_data_hash", "=", pdh]]).execute() + if len(wd["items"]) == 0: + c.save_new(owner_uuid=workflow_def_project, name="Metadata schema", ensure_unique_name=True) + return "keep:%s/schema.yml" % pdh + + def main(): parser = argparse.ArgumentParser(description='Analyze collections uploaded to a project') parser.add_argument('--uploader-project', type=str, default='lugli-j7d0g-n5clictpuvwk8aa', help='') parser.add_argument('--pangenome-analysis-project', type=str, default='lugli-j7d0g-y4k4uswcqi3ku56', help='') parser.add_argument('--fastq-project', type=str, default='lugli-j7d0g-xcjxp4oox2u1w8u', help='') parser.add_argument('--validated-project', type=str, default='lugli-j7d0g-5ct8p1i1wrgyjvp', help='') + parser.add_argument('--workflow-def-project', type=str, default='lugli-j7d0g-5hswinmpyho8dju', help='') parser.add_argument('--pangenome-workflow-uuid', type=str, default='lugli-7fd4e-mqfu9y3ofnpnho1', help='') parser.add_argument('--fastq-workflow-uuid', type=str, default='lugli-7fd4e-2zp9q4jo5xpif9y', help='') @@ -183,6 +211,8 @@ def main(): logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) + schema_ref = upload_schema(api, args.workflow_def_project) + while True: move_fastq_to_fasta_results(api, args.fastq_project, args.uploader_project) @@ -198,7 +228,8 @@ def main(): start_pangenome_analysis(api, args.pangenome_analysis_project, args.pangenome_workflow_uuid, - args.validated_project) + args.validated_project, + schema_ref) copy_most_recent_result(api, args.pangenome_analysis_project, diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index cf9b015..a072bd7 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -174,9 +174,9 @@ $graph: jsonldPredicate: _id: "@id" #_type: "@id" - sequencefile: - doc: The subject 
(eg the fasta/fastq file) that this metadata describes + id: + doc: The subject (eg the fasta/fastq file) that the metadata describes type: string? jsonldPredicate: _id: "@id" - _type: "@id" \ No newline at end of file + _type: "@id" diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index bfb8c51..2032508 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -22,7 +22,7 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - if not qc_metadata(args.metadata.name): + if not bh20sequploader.qc_metadata.qc_metadata(args.metadata.name): print("Failed metadata qc") exit(1) diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index ebe4dfc..38edcaa 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,12 +1,25 @@ import schema_salad.schema +import schema_salad.ref_resolver import logging import pkg_resources import logging +import traceback + +class CustomFetcher(schema_salad.ref_resolver.DefaultFetcher): + def check_exists(sup, url): + if url.startswith("keep:"): + return True + else: + return super().check_exists(url) + + def supported_schemes(self): # type: () -> List[str] + return ["file", "http", "https", "mailto", "keep"] + def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} - (document_loader, + (loader, avsc_names, schema_metadata, metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) @@ -15,9 +28,23 @@ def qc_metadata(metadatafile): print(avsc_names) return False + document_loader = schema_salad.ref_resolver.Loader( + loader.ctx, + schemagraph=loader.graph, + foreign_properties=loader.foreign_properties, + 
idx=loader.idx, + cache=loader.cache, + fetcher_constructor=CustomFetcher, + skip_schemas=loader.skip_schemas, + url_fields=loader.url_fields, + allow_attachments=loader.allow_attachments, + session=loader.session, + ) + try: doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) return True except Exception as e: + traceback.print_exc() logging.warn(e) return False -- cgit v1.2.3 From 925058d0b3db70803d322cc2a33801240899a20a Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 15:52:37 -0400 Subject: Fix up fasta/fastq validation --- bh20seqanalyzer/main.py | 9 ++++++++- bh20sequploader/main.py | 29 +++++------------------------ setup.py | 5 +++-- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 1fb51b5..c05b402 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -8,6 +8,7 @@ import json import logging import ruamel.yaml from bh20sequploader.qc_metadata import qc_metadata +from bh20sequploader.qc_fasta import qc_fasta import pkg_resources from schema_salad.sourceline import add_lc_filename @@ -38,7 +39,13 @@ def validate_upload(api, collection, validated_project, logging.warn("Failed metadata qc") if valid: - if "sequence.fasta" not in col: + if "sequence.fasta" in col: + try: + qc_fasta(col.open("sequence.fasta")) + except Exception as e: + logging.warn(e) + valid = False + else: if "reads.fastq" in col: start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) return False diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 2032508..4a225f6 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -8,7 +8,8 @@ from pathlib import Path import urllib.request import socket import getpass -from qc_metadata import qc_metadata +from .qc_metadata import qc_metadata +from .qc_fasta import qc_fasta ARVADOS_API_HOST='lugli.arvadosapi.com' 
ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -22,34 +23,14 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) - if not bh20sequploader.qc_metadata.qc_metadata(args.metadata.name): + target = qc_fasta(args.sequence) + + if not qc_metadata(args.metadata.name): print("Failed metadata qc") exit(1) col = arvados.collection.Collection(api_client=api) - magic_file = Path(__file__).parent / "validation" / "formats.mgc" - val = magic.Magic(magic_file=magic_file.resolve().as_posix(), - uncompress=False, mime=True) - seq_type = val.from_file(args.sequence.name).lower() - print(f"Sequence type: {seq_type}") - if seq_type == "text/fasta": - # ensure that contains only one entry - entries = 0 - for line in args.sequence: - if line.startswith(">"): - entries += 1 - if entries > 1: - raise ValueError("FASTA file contains multiple entries") - break - args.sequence.close() - args.sequence = open(args.sequence.name, "r") - target = "reads.fastq" - elif seq_type == "text/fastq": - target = "sequence.fasta" - else: - raise ValueError("Sequence file does not look like FASTA or FASTQ") - with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) diff --git a/setup.py b/setup.py index 41ace7b..18e858e 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ try: except ImportError: tagger = egg_info_cmd.egg_info -install_requires = ["arvados-python-client", "schema-salad"] +install_requires = ["arvados-python-client", "schema-salad", "python-magic"] web_requires = ["flask", "pyyaml"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) @@ -31,7 +31,8 @@ setup( author_email="peter.amstutz@curii.com", license="Apache 2.0", packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"], - package_data={"bh20sequploader": ["bh20seq-schema.yml"]}, + package_data={"bh20sequploader": ["bh20seq-schema.yml", "validation/formats"], + }, install_requires=install_requires, extras_require={ 
'web': web_requires -- cgit v1.2.3 From 1f66b8270a7bf06f98e2a336385bc84b778ead66 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 15:53:58 -0400 Subject: Add qc_fasta --- bh20sequploader/qc_fasta.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 bh20sequploader/qc_fasta.py diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py new file mode 100644 index 0000000..e3d4fe7 --- /dev/null +++ b/bh20sequploader/qc_fasta.py @@ -0,0 +1,28 @@ +import pkg_resources +import tempfile +import magic + +def qc_fasta(sequence): + schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") + with tempfile.NamedTemporaryFile() as tmp: + tmp.write(schema_resource.read()) + tmp.flush() + val = magic.Magic(magic_file=tmp.name, + uncompress=False, mime=True) + seq_type = val.from_buffer(sequence.read(4096)).lower() + sequence.seek(0) + if seq_type == "text/fasta": + # ensure that contains only one entry + entries = 0 + for line in sequence: + if line.startswith(">"): + entries += 1 + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + sequence.seek(0) + return "reads.fastq" + elif seq_type == "text/fastq": + return "sequence.fasta" + else: + raise ValueError("Sequence file does not look like FASTA or FASTQ") -- cgit v1.2.3 From 1b1283131f3c684bfff2c1b165565957ac01b4be Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 16:11:29 -0400 Subject: Fix swapped fasta/fastq Replace example sequence with COV19 reference instead of HIV. 
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com> --- bh20seqanalyzer/main.py | 1 + bh20sequploader/qc_fasta.py | 4 +- example/sequence.fasta | 642 +++++++++++++++++++++++++++++--------------- 3 files changed, 432 insertions(+), 215 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index c05b402..7626662 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -47,6 +47,7 @@ def validate_upload(api, collection, validated_project, valid = False else: if "reads.fastq" in col: + logging.info("Upload '%s' running fastq2fasta", collection["name"]) start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) return False else: diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index e3d4fe7..e47d66b 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -21,8 +21,8 @@ def qc_fasta(sequence): raise ValueError("FASTA file contains multiple entries") break sequence.seek(0) - return "reads.fastq" - elif seq_type == "text/fastq": return "sequence.fasta" + elif seq_type == "text/fastq": + return "reads.fastq" else: raise ValueError("Sequence file does not look like FASTA or FASTQ") diff --git a/example/sequence.fasta b/example/sequence.fasta index 3c4c0ef..b364687 100644 --- a/example/sequence.fasta +++ b/example/sequence.fasta @@ -1,214 +1,430 @@ ->AF324493.2 HIV-1 vector pNL4-3, complete sequence -TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTA -CTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTC -AAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAATGAAGGAGAGAACAACAGCTTGTTACACC -CTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGC -ATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAG -GGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGAT -GCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGC -TCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTG 
-TGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCT -AGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCG -GCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGC -GGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAA -AAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGC -TAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCT -ACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGT -GTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAA -GTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTAT -AGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAA -GTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCC -CACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGAC -CATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAG -ATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGA -CACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGT -AAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTA -GACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCT -TGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGA -AGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATG -AGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTA -AGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAAAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTG -GAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATC -TGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAG -AGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTA -TCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAA -AGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAA -ACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATC 
-TGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGT -TGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCC -AGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATT -TGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTAT -TTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAAC -TCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACA -GTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTA -CCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAA -AGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGAC -ATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAA -AAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGA -ACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCA -GAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTT -ATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACC -ACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTG -TATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAA -TTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAGTATGCAAGAATGAAGGGTGCCCACACTAATGA -TGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCT -AAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGA -TTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCAT -AATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATAT -GTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAG -CAATTCATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGG -AATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAA -AAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAAT -TGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAA -ATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTA 
-GCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATAT -GGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATAT -AGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGA -TGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTT -GGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTAT -GAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAA -ATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAG -TAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCG -GGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGG -GCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATT -ATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGAT -TAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTAC -TAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGG -GGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGAT -ATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGA -ATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAAC -AAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGC -CTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCA -TACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATAT -GGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAAT -AAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACT -CGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAG -CCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAA -AAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGAC -TCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCA -TTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAA -GACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGA 
-AGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGAT -CTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACC -ACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCT -GTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAA -AAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTA -AAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTA -GCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGA -TAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGG -TTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATAC -ATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATG -TACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAAT -GGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAG -TACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTAT -CCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAAC -ATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATA -ATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGG -AGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACT -GAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACA -TGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATAT -TACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGA -GGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAG -TAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTT -CCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCGTCAATGACGCTGACGGTACAGGCCAGA -CAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGT -TGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGA -TCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCT -AGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTA 
-ACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGA -ATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATA -AAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGA -ATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAG -GCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCC -TTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCT -TGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCT -CCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTA -GCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAA -GAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTG -GATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCG -AGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTA -GAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACA -AGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAG -AAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACA -CCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATA -AGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGA -CCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTG -CATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAG -GGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTG -CCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACT -GCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGT -AACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCACCCAGGAGGTAGAGGTTGCAG -TGAGCCAAGATCGCGCCACTGCATTCCAGCCTGGGCAAGAAAACAAGACTGTCTAAAATAATAATAATAA -GTTAAGGGTATTAAATATATTTATACATGGAGGTCATAAAAATATATATATTTGGGCTGGGCGCAGTGGC -TCACACCTGCGCCCGGCCCTTTGGGAGGCCGAGGCAGGTGGATCACCTGAGTTTGGGAGTTCCAGACCAG -CCTGACCAACATGGAGAAACCCCTTCTCTGTGTATTTTTAGTAGATTTTATTTTATGTGTATTTTATTCA 
-CAGGTATTTCTGGAAAACTGAAACTGTTTTTCCTCTACTCTGATACCACAAGAATCATCAGCACAGAGGA -AGACTTCTGTGATCAAATGTGGTGGGAGAGGGAGGTTTTCACCAGCACATGAGCAGTCAGTTCTGCCGCA -GACTCGGCGGGTGTCCTTCGGTTCAGTTCCAACACCGCCTGCCTGGAGAGAGGTCAGACCACAGGGTGAG -GGCTCAGTCCCCAAGACATAAACACCCAAGACATAAACACCCAACAGGTCCACCCCGCCTGCTGCCCAGG -CAGAGCCGATTCACCAAGACGGGAATTAGGATAGAGAAAGAGTAAGTCACACAGAGCCGGCTGTGCGGGA -GAACGGAGTTCTATTATGACTCAAATCAGTCTCCCCAAGCATTCGGGGATCAGAGTTTTTAAGGATAACT -TAGTGTGTAGGGGGCCAGTGAGTTGGAGATGAAAGCGTAGGGAGTCGAAGGTGTCCTTTTGCGCCGAGTC -AGTTCCTGGGTGGGGGCCACAAGATCGGATGAGCCAGTTTATCAATCCGGGGGTGCCAGCTGATCCATGG -AGTGCAGGGTCTGCAAAATATCTCAAGCACTGATTGATCTTAGGTTTTACAATAGTGATGTTACCCCAGG -AACAATTTGGGGAAGGTCAGAATCTTGTAGCCTGTAGCTGCATGACTCCTAAACCATAATTTCTTTTTTG -TTTTTTTTTTTTTATTTTTGAGACAGGGTCTCACTCTGTCACCTAGGCTGGAGTGCAGTGGTGCAATCAC -AGCTCACTGCAGCCTCAACGTCGTAAGCTCAAGCGATCCTCCCACCTCAGCCTGCCTGGTAGCTGAGACT -ACAAGCGACGCCCCAGTTAATTTTTGTATTTTTGGTAGAGGCAGCGTTTTGCCGTGTGGCCCTGGCTGGT -CTCGAACTCCTGGGCTCAAGTGATCCAGCCTCAGCCTCCCAAAGTGCTGGGACAACCGGGGCCAGTCACT -GCACCTGGCCCTAAACCATAATTTCTAATCTTTTGGCTAATTTGTTAGTCCTACAAAGGCAGTCTAGTCC -CCAGGCAAAAAGGGGGTTTGTTTCGGGAAAGGGCTGTTACTGTCTTTGTTTCAAACTATAAACTAAGTTC -CTCCTAAACTTAGTTCGGCCTACACCCAGGAATGAACAAGGAGAGCTTGGAGGTTAGAAGCACGATGGAA -TTGGTTAGGTCAGATCTCTTTCACTGTCTGAGTTATAATTTTGCAATGGTGGTTCAAAGACTGCCCGCTT -CTGACACCAGTCGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCT -TCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAA -AGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCA -AAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCAT -CACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCC -CTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCC -TTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCC -AAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTG -AGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAG 
-GTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTT -GGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAA -CCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGA -AGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTC -ATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAA -GTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTG -TCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCA -TCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACC -AGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTG -TTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGC -ATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTA -CATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTT -GGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGA -TGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCT -CTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAA -ACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGT -GCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAA -ATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTA -TTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAA -ATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACAT -TAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTCGCGCGTTTCGGTGATGACGGTGAAAACCT -CTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGT -CAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTAC -TGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCA -TTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGGGG -AGGCAGAGATTGCAGTAAGCTGAGATCGCAGCACTGCACTCCAGCCTGGGCGACAGAGTAAGACTCTGTC -TCAAAAATAAAATAAATAAATCAATCAGATATTCCAATCTTTTCCTTTATTTATTTATTTATTTTCTATT 
-TTGGAAACACAGTCCTTCCTTATTCCAGAATTACACATATATTCTATTTTTCTTTATATGCTCCAGTTTT -TTTTAGACCTTCACCTGAAATGTGTGTATACAAAATCTAGGCCAGTCCAGCAGAGCCTAAAGGTAAAAAA -TAAAATAATAAAAAATAAATAAAATCTAGCTCACTCCTTCACATCAAAATGGAGATACAGCTGTTAGCAT -TAAATACCAAATAACCCATCTTGTCCTCAATAATTTTAAGCGCCTCTCTCCACCACATCTAACTCCTGTC -AAAGGCATGTGCCCCTTCCGGGCGCTCTGCTGTGCTGCCAACCAACTGGCATGTGGACTCTGCAGGGTCC -CTAACTGCCAAGCCCCACAGTGTGCCCTGAGGCTGCCCCTTCCTTCTAGCGGCTGCCCCCACTCGGCTTT -GCTTTCCCTAGTTTCAGTTACTTGCGTTCAGCCAAGGTCTGAAACTAGGTGCGCACAGAGCGGTAAGACT -GCGAGAGAAAGAGACCAGCTTTACAGGGGGTTTATCACAGTGCACCCTGACAGTCGTCAGCCTCACAGGG -GGTTTATCACATTGCACCCTGACAGTCGTCAGCCTCACAGGGGGTTTATCACAGTGCACCCTTACAATCA -TTCCATTTGATTCACAATTTTTTTAGTCTCTACTGTGCCTAACTTGTAAGTTAAATTTGATCAGAGGTGT -GTTCCCAGAGGGGAAAACAGTATATACAGGGTTCAGTACTATCGCATTTCAGGCCTCCACCTGGGTCTTG -GAATGTGTCCCCCGAGGGGTGATGACTACCTCAGTTGGATCTCCACAGGTCACAGTGACACAAGATAACC -AAGACACCTCCCAAGGCTACCACAATGGGCCGCCCTCCACGTGCACATGGCCGGAGGAACTGCCATGTCG -GAGGTGCAAGCACACCTGCGCATCAGAGTCCTTGGTGTGGAGGGAGGGACCAGCGCAGCTTCCAGCCATC -CACCTGATGAACAGAACCTAGGGAAAGCCCCAGTTCTACTTACACCAGGAAAGGC +>NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA +CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC +TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG +TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC +CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC +GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG +CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT +GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC +GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT +TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA +GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG 
+TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG +CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG +TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG +CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA +ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA +CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC +CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA +GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT +ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG +GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG +CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA +CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA +ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT +TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG +GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG +TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC +GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG +ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG +GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT +AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA +TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT +AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA +GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC +TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT +AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA +GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT 
+ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA +GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT +GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA +ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA +TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG +AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT +TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA +CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC +AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA +GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA +CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG +TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT +GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT +TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA +TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT +GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA +AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC +TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA +GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG +ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT +GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT +GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA +TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC +AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA +TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA 
+CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA +TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT +TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG +AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA +ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC +CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA +AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT +AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG +ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA +TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA +ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA +ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT +GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT +GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT +TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC +ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT +GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG +ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG +TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG +TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC +CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA +CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT +GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC +CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG +TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA +ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA 
+AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA +TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT +ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG +CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA +TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG +CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA +TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC +TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG +GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT +TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA +TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT +ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC +TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC +TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT +GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG +GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT +GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA +GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA +TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC +AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT +GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT +AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT +AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG +AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT +TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT +ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG 
+GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT +ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG +AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG +CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT +TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA +ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA +ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC +ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC +ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC +CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT +TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT +GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC +TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC +AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA +GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG +GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA +CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC +CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT +ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT +CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG +TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG +CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC +GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC +TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC +TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT +GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG 
+ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG +ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG +TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT +ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG +TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT +GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA +CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT +TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA +ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG +CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA +TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA +ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC +AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC +TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC +ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA +TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT +AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG +ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT +CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG +TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC +TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG +GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA +GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC +TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA +AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA +GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA 
+GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA +TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA +GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG +AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA +TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA +ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT +TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG +TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT +GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA +CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT +TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC +TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT +ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT +ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT +GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT +GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG +TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA +AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG +TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA +CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA +ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT +GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC +CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA +ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC +ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG +ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT 
+AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA +GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG +TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT +TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA +AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG +ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG +ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG +CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT +ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC +AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT +GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC +AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG +ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT +AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC +TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC +TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG +GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG +TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT +AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA +GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC +AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC +ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC +AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC +ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT +GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA +TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC 
+AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC +GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA +TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA +CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT +AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG +TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT +ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT +GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT +GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA +TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT +GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG +AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT +TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC +GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT +AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC +TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG +GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT +GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT +AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT +TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA +AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC +ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT +TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC +TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA +GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC +AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA 
+TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC +TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA +ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT +GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC +TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA +CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT +CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA +GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG +TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA +TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA +GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC +TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT +TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA +TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA +TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT +AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA +AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA +CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT +GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG +TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG +AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC +ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC +CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT +AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT +TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA +ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG 
+ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA +GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA +AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA +CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA +CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT +TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC +CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA +AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT +AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA +CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA +CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT +ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG +ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA +AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT +ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC +GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA +TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC +GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT +TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA +ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT +AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG +GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG +GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG +GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA +AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG +GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA 
+CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA +ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA +GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG +TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC +TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC +CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT +TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC +GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC +AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA +TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT +TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA +GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA +ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT +GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT +GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG +TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC +ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT +GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG +ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC +TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA +GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT +TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT +TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC +AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC +TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA +GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC +CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA 
+CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT +CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG +GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC +CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA +ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA +TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC +AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA +TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT +GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC +TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG +ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG +GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA +AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA +CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA +TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG +TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT +ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA +TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA +GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT +TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA +CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC +TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA +GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG +CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT +ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG 
+ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA +ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC +GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT +TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT +GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG +GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT +AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT +CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA +CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG +GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA +ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC +CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT +TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC +TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT +TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT +GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT +CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA +TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC +CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA +AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT +AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC +ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT +GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA +GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG +ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG +CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC 
+TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA +AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC +CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA +GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA +TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT +TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT +GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT +ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG +CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA +GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG +TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC +GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA +TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT +GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA +AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG +ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG +TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT +GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC +CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT +GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA +AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC +ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT +AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA +ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG +TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG +CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC 
+AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA +ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG +TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC +TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC +TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT +TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG +CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT +GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT +TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC +GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT +TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAA -- cgit v1.2.3 From f2a3aeb6d7d8ba210d060b83ea46475eab3626ac Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 16:20:13 -0400 Subject: Improve fasta/fastq QC --- bh20seqanalyzer/main.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 7626662..63ff067 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -39,20 +39,21 @@ def validate_upload(api, collection, validated_project, logging.warn("Failed metadata qc") if valid: - if "sequence.fasta" in col: - try: - qc_fasta(col.open("sequence.fasta")) - except Exception as e: - logging.warn(e) - valid = False - else: - if "reads.fastq" in col: - logging.info("Upload '%s' running fastq2fasta", collection["name"]) - start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) - return False - else: - valid = False - logging.warn("Upload '%s' missing sequence.fasta", collection["name"]) + tgt = None + for n in ("sequence.fasta", "reads.fastq"): + if n not in col: + continue + with col.open(n) as qf: + tgt = qc_fasta(qf) + if tgt != n: + logging.info("Expected %s but magic says it 
should be %s", n, tgt) + valid = False + elif tgt == "reads.fastq": + start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid) + return False + if tgt is None: + valid = False + logging.warn("Upload '%s' does not contain sequence.fasta or reads.fastq", collection["name"]) dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() -- cgit v1.2.3 From ffad8c21f08aca1cf65809c398f3613846e7c8ba Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 10 Apr 2020 16:47:31 -0400 Subject: Propagating metadata to output works now. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com> --- bh20seqanalyzer/main.py | 4 ++-- bh20sequploader/bh20seq-schema.yml | 5 ----- bh20sequploader/qc_metadata.py | 26 +------------------------- example/metadata.yaml | 2 -- 4 files changed, 3 insertions(+), 34 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 63ff067..193a268 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -29,7 +29,7 @@ def validate_upload(api, collection, validated_project, else: try: metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml")) - metadata_content["id"] = "keep:%s/metadata.yaml" % collection["portable_data_hash"] + metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"] add_lc_filename(metadata_content, metadata_content["id"]) valid = qc_metadata(metadata_content) and valid except Exception as e: @@ -146,7 +146,7 @@ def start_pangenome_analysis(api, "class": "File", "location": "keep:%s/metadata.yaml" % v["portable_data_hash"] }) - inputobj["subjects"].append("keep:%s/sequence.fasta" % v["portable_data_hash"]) + inputobj["subjects"].append("http://arvados.org/keep:%s/sequence.fasta" % v["portable_data_hash"]) run_workflow(api, analysis_project, pangenome_workflow_uuid, "Pangenome analysis", inputobj) diff --git 
a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index a072bd7..8a22db1 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -169,11 +169,6 @@ $graph: virus: virusSchema? technology: technologySchema submitter: submitterSchema - submission: - type: string - jsonldPredicate: - _id: "@id" - #_type: "@id" id: doc: The subject (eg the fasta/fastq file) that the metadata describes type: string? diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 38edcaa..e477f21 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -5,21 +5,10 @@ import pkg_resources import logging import traceback -class CustomFetcher(schema_salad.ref_resolver.DefaultFetcher): - def check_exists(sup, url): - if url.startswith("keep:"): - return True - else: - return super().check_exists(url) - - def supported_schemes(self): # type: () -> List[str] - return ["file", "http", "https", "mailto", "keep"] - - def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} - (loader, + (document_loader, avsc_names, schema_metadata, metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) @@ -28,19 +17,6 @@ def qc_metadata(metadatafile): print(avsc_names) return False - document_loader = schema_salad.ref_resolver.Loader( - loader.ctx, - schemagraph=loader.graph, - foreign_properties=loader.foreign_properties, - idx=loader.idx, - cache=loader.cache, - fetcher_constructor=CustomFetcher, - skip_schemas=loader.skip_schemas, - url_fields=loader.url_fields, - allow_attachments=loader.allow_attachments, - session=loader.session, - ) - try: doc, metadata = 
schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) return True diff --git a/example/metadata.yaml b/example/metadata.yaml index a2f6e57..c780921 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -1,5 +1,3 @@ -submission: publicSequenceResource - host: host_id: XX1 host_species: string -- cgit v1.2.3 From fc872f15da426926414fb7629bf6660d9880ed1e Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 10 Apr 2020 17:16:35 -0500 Subject: Draft --- paper/paper.bib | 16 ++++++ paper/paper.md | 160 +++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 151 insertions(+), 25 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index e69de29..bcb9c0b 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -0,0 +1,16 @@ +@book{CWL, +title = "Common Workflow Language, v1.0", +abstract = "The Common Workflow Language (CWL) is an informal, multi-vendor working group consisting of various organizations and individuals that have an interest in portability of data analysis workflows. Our goal is to create specifications that enable data scientists to describe analysis tools and workflows that are powerful, easy to use, portable, and support reproducibility.CWL builds on technologies such as JSON-LD and Avro for data modeling and Docker for portable runtime environments. CWL is designed to express workflows for data-intensive science, such as Bioinformatics, Medical Imaging, Chemistry, Physics, and Astronomy.This is v1.0 of the CWL tool and workflow specification, released on 2016-07-08", +keywords = "cwl, workflow, specification", +author = "Brad Chapman and John Chilton and Michael Heuer and Andrey Kartashov and Dan Leehr and Herv{\'e} M{\'e}nager and Maya Nedeljkovich and Matt Scales and Stian Soiland-Reyes and Luka Stojanovic", +editor = "Peter Amstutz and Crusoe, {Michael R.} and Nebojša Tijanić", +note = "Specification, product of the Common Workflow Language working group. 
http://www.commonwl.org/v1.0/", +year = "2016", +month = "7", +day = "8", +doi = "10.6084/m9.figshare.3115156.v2", +language = "English", +publisher = "figshare", +address = "United States", + +} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index caa9903..813c91b 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,8 +1,9 @@ --- -title: 'Public Sequence Resource for COVID-19' +title: 'CPSR: COVID-19 Public Sequence Resource' +title_short: 'CPSR: COVID-19 Public Sequence Resource' tags: - Sequencing - - COVID + - COVID-19 authors: - name: Pjotr Prins orcid: 0000-0002-8021-9162 @@ -25,16 +26,30 @@ authors: - name: Rutger Vos orcid: 0000 affiliation: 7 - - Michael Heuer + - name: Michael Heuer orcid: 0000 affiliation: 8 - + - name: Adam Novak + orcid: 0000 + affiliation: 9 + - name: Alex Kanitz + orcid: 0000 + affiliation: 10 + - name: Jerven Bolleman + orcid: 0000 + affiliation: 11 + - name: Joep de Ligt + orcid: 0000 + affiliation: 12 affiliations: - name: Department of Genetics, Genomics and Informatics, The University of Tennessee Health Science Center, Memphis, TN, USA. index: 1 - name: Curii, Boston, USA index: 2 date: 11 April 2020 +event: COVID2020 +group: Public Sequence Uploader +authors_short: Pjotr Prins & Peter Amstutz \emph{et al.} bibliography: paper.bib --- @@ -49,13 +64,48 @@ pasting above link (or yours) with https://github.com/biohackrxiv/bhxiv-gen-pdf +Note that author order will change! + --> # Introduction -As part of the one week COVID-19 Biohackathion 2020, we formed a -working group on creating a public sequence resource for Corona virus. - +As part of the COVID-19 BioHackathon 2020 we formed a working +group to create a COVID-19 Public Sequence Resource (CPSR) for +Corona virus sequences. The general idea was to create a +repository that has a low barrier to entry for uploading sequence +data using best practices.
I.e., data published with a creative +commons 4.0 (CC-4.0) license with metadata using state-of-the-art +standards and, perhaps most importantly, providing standardized +workflows that get triggered on upload, so that results are +immediately available in standardized data formats. + +Existing data repositories for viral data include GISAID, EBI ENA +and NCBI. These repositories allow for free sharing of data, but +do not add value in terms of running immediate +computations. Also, GISAID, at this point, has the most complete +collection of genetic sequence data of influenza viruses and +related clinical and epidemiological data through its +database. But, due to a restricted license, data submitted to +GISAID can not be used for online web services and on-the-fly +computation. In addition, GISAID requires registration, which can take weeks +and, painfully, forces users to download sequences one at a time +to do any type of analysis. In our opinion this does not fit a +pandemic scenario where fast turnaround times are key and data +analysis has to be agile. + +We managed to create a useful sequence uploader utility within +one week by leveraging existing technologies, such as the Arvados +Cloud platform [@Arvados], the Common Workflow Language (CWL) +[@CWL], Docker images built with Debian packages, and the many +free and open source software packages that are available for +bioinformatics. + +The source code for the CLI uploader and web uploader can be +found [here](https://github.com/arvados/bh20-seq-resource) +(FIXME: we'll have a full page). The CWL workflow definitions can +be found [here](https://github.com/hpobio-lab/viral-analysis) and +on CWL hub (FIXME). <!-- @@ -73,38 +123,98 @@ working group on creating a public sequence resource for Corona virus. ## Cloud computing backend -Peter, Pjotr, MichaelC - -## A command-line sequence uploader +The development of CPSR was accelerated by using the Arvados +Cloud platform.
Arvados is an open source platform for managing, +processing, and sharing genomic and other large scientific and +biomedical data. The Arvados instance was deployed on Amazon AWS +for testing and development and a project was created that +allows for uploading data. -Peter, Pjotr +## Sequence uploader -## Metadata uploader +We wrote a Python-based uploader that authenticates with Arvados +using a token. Data gets validated for being a FASTA sequence, +FASTQ raw data and/or metadata in the form of JSON LD that gets +validated against a schema. The uploader can be used +from a command line or using a simple web interface. -With Thomas +## Creating a Pangenome -## FASTA to GFA workflow +### FASTA to GFA workflow -Michael Heuer +The first workflow (1) we implemented was a FASTA to Graphical +Fragment Assembly (GFA) Format conversion. When someone uploads a +sequence in FASTA format it gets combined with all known viral +sequences in our storage to generate a pangenome or variation +graph (VG). The full pangenome is made available as a +downloadable GFA file together with a visualisation (Figure 1). -## BAM to GFA workflow +### FASTQ to GFA workflow -Tazro & Erik +In the next step we introduced a workflow (2) that takes raw +sequence data in fastq format and converts that into FASTA. +This FASTA file, in turn, gets fed to workflow (1) to generate +the pangenome. -## Phylogeny app +## Creating linked data workflow -With Rutger +We created a workflow (3) that takes GFA and turns that into +RDF. Together with the metadata at upload time a single RDF +resource is compiled that can be linked against external +resources such as Uniprot and Wikidata. The generated RDF file +can be hosted in any triple store and queried using SPARQL. -## RDF app +## Creating a Phylogeny workflow -Jerven? +WIP -## EBI app - -? +## Other workflows? # Discussion -Future work... +CPSR is a data repository with computational pipelines that will +persist during pandemics. 
Unlike other data repositories for +SARS-CoV-2, we created a repository that immediately computes the +pangenome of all available data and presents that in useful +formats for further analysis, including visualisations, GFA and +RDF. Code and data are available and written using best practises +and state-of-the-art standards. CPSR can be deployed by anyone, +anywhere. + +CPSR is designed to abide by FAIR data principles (expand...) + +CPSR is primed with viral data coming from repositories that have +no sharing restrictions. The metadata includes relevant +attribution to uploaders. Some institutes have already committed +to uploading their data to CPSR first so as to warrant sharing +for computation. + +CPSR is currently running on an Arvados cluster in the cloud. To +ensure the service remains running we will source money from +projects during pandemics. The workflows are written in CWL which +means they can be deployed on any infrastructure that runs +CWL. One of the advantages of the CC-4.0 license is that we make +available all uploaded sequence and meta data, as well as +results, online to anyone. So the data can be mirrored by any +party. This guarantees the data will live on. + +<!-- Future work... --> + +We aim to add more workflows to CPSR, for example to prepare +sequence data for submitting in other public repositories, such +as EBI ENA and GISAID. This will allow researchers to share data +in multiple systems without pain, circumventing current sharing +restrictions. + +# Acknowledgements + +We thank the COVID-19 BioHackathon 2020 and ELIXIR for creating a +unique event that triggered many collaborations. We thank Curii +Corporation for their financial support for creating and running +Arvados instances. We thank Amazon AWS for their financial +support to run COVID-19 workflows. We also want to thank the +other working groups in the BioHackathon who generously +contributed ontologies, workflows and software.
+ # References -- cgit v1.2.3 From dcd7f12d10e7f6399a0d515606148f85358d9dc7 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Fri, 10 Apr 2020 17:52:45 -0500 Subject: Add author and affiliation --- paper/paper.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 813c91b..bc7e835 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -27,7 +27,7 @@ authors: orcid: 0000 affiliation: 7 - name: Michael Heuer - orcid: 0000 + orcid: 0000-0002-9052-6000 affiliation: 8 - name: Adam Novak orcid: 0000 @@ -46,6 +46,8 @@ affiliations: index: 1 - name: Curii, Boston, USA index: 2 + - name: RISE Lab, University of California Berkeley, Berkeley, CA, USA. + index: 8 date: 11 April 2020 event: COVID2020 group: Public Sequence Uploader -- cgit v1.2.3 From 89f996912240cfb2f5adcf95f401dd59319dac3b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 10 Apr 2020 16:28:08 -0700 Subject: Add affiliation info --- paper/paper.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 813c91b..b789f60 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -29,8 +29,8 @@ authors: - name: Michael Heuer orcid: 0000 affiliation: 8 - - name: Adam Novak - orcid: 0000 + - name: Adam M Novak + orcid: 0000-0001-5828-047X affiliation: 9 - name: Alex Kanitz orcid: 0000 @@ -46,6 +46,8 @@ affiliations: index: 1 - name: Curii, Boston, USA index: 2 + - name: UC Santa Cruz Genomics Institute, University of California, Santa Cruz, CA 95064, USA. 
+ index: 9 date: 11 April 2020 event: COVID2020 group: Public Sequence Uploader -- cgit v1.2.3 From fcd45e42942750950076553ac995d738c863aa7a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 10 Apr 2020 16:30:21 -0700 Subject: Grab Erik --- paper/paper.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index b789f60..e7678dc 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -31,7 +31,7 @@ authors: affiliation: 8 - name: Adam M Novak orcid: 0000-0001-5828-047X - affiliation: 9 + affiliation: 5 - name: Alex Kanitz orcid: 0000 affiliation: 10 @@ -47,7 +47,7 @@ affiliations: - name: Curii, Boston, USA index: 2 - name: UC Santa Cruz Genomics Institute, University of California, Santa Cruz, CA 95064, USA. - index: 9 + index: 5 date: 11 April 2020 event: COVID2020 group: Public Sequence Uploader -- cgit v1.2.3 From 4f4229a61e63649730e7120c764f364078860f0f Mon Sep 17 00:00:00 2001 From: lltommy Date: Sat, 11 Apr 2020 02:44:04 +0200 Subject: Adding descriptions to the yml schema, slight changes to the yml schema --- bh20sequploader/bh20seq-schema.yml | 45 ++++++++++++++++++++++++++++++++------ example/metadata.yaml | 5 +++-- example/minimal_example.yaml | 6 ++--- 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 8a22db1..c259f16 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -13,41 +13,51 @@ $graph: type: record fields: host_species: + doc: Host species as defined in NCBITaxon (e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens) type: string jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0000532 host_id: + doc: Identifer for the host. 
If you submit multiple samples from the same host, use the same host_id for those samples type: string jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 host_common_name: + doc: Text label for the host species (e.g. homo sapiens) type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/NOMEN_0000037 host_sex: + doc: Sex of the host as defined in NCIT, IRI expected (http://purl.obolibrary.org/obo/C20197 (Male), http://purl.obolibrary.org/obo/NCIT_C27993 (Female) or unknown (http://purl.obolibrary.org/obo/NCIT_C17998)) type: string jsonldPredicate: _id: http://purl.obolibrary.org/obo/PATO_0000047 host_age: + doc: Age of the host as number (e.g. 50) type: int? jsonldPredicate: _id: http://purl.obolibrary.org/obo/PATO_0000011 host_age_unit: + doc: Unit of host age.... this field is unstable as of now (might be removed) type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/UO_0000036 host_health_status: + doc: A condition or state at a particular time type: string? jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688 host_treatment: + doc: Process in which the act is intended to modify or alter type: string? jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0000727 host_vaccination: + doc: Field is unstable type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/VO_0000001 additional_host_information: + doc: Field for additional host information type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 @@ -56,38 +66,47 @@ $graph: type: record fields: collector_name: + doc: Name of the person that took the sample type: string jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001895 collecting_institution: + doc: Institute that was responsible for sampling type: string jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 specimen_source: + doc: A specimen that derives from an anatomical part or substance arising from an organism, e.g. tissue, organ type: string?
jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001479 collection_date: + doc: Date when the sample was taken type: string? jsonldPredicate: _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164 collection_location: + doc: Geographical location where the sample was collected as Gazetteer (https://www.ebi.ac.uk/ols/ontologies/gaz) reference, e.g. http://purl.obolibrary.org/obo/GAZ_00002845 (China) type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/GAZ_00000448 sample_storage_conditions: + doc: Information about storage of a specified type, e.g. frozen specimen, paraffin, fresh .... type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0001472 additional_collection_information: + doc: Add additional comment about the circumstances that a sample was taken type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 sample_id: + doc: Id of the sample as defined by the submitter type: string jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 source_database_accession: + doc: If data is deposited at a public resource (e.g. Genbank, ENA) enter the Accession Id here type: string? jsonldPredicate: _id: http://edamontology.org/data_2091 @@ -96,10 +115,12 @@ $graph: type: record fields: virus_species: + doc: The name of a taxon from the NCBI taxonomy database type: string? jsonldPredicate: _id: http://edamontology.org/data_1875 virus_strain: + doc: Name of the virus strain type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_010055 @@ -108,14 +129,17 @@ $graph: type: record fields: sample_sequencing_technology: + doc: Technology that was used to sequence this sample (e.g. Sanger, Nanopore MinION) type: string jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 sequence_assembly_method: + doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome type: string?
jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0002699 sequencing_coverage: + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/FLU_0000848 @@ -124,22 +148,22 @@ $graph: type: record fields: submitter_name: + doc: Name of the submitter type: string jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000116 - submitter_date: - type: string - jsonldPredicate: - _id: http://purl.obolibrary.org/obo/NCIT_C94162 submitter_address: + doc: Address of the submitter type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000172 originating_lab: + doc: Name of the laboratory that took the sample type: string jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C37984 lab_address: + doc: Address of the laboratory where the sample was taken type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/OBI_0600047 @@ -152,10 +176,17 @@ $graph: jsonldPredicate: _id: http://www.ebi.ac.uk/efo/EFO_0001741 authors: + doc: Name of the author(s) type: string? jsonldPredicate: _id: http://purl.obolibrary.org/obo/NCIT_C42781 - submitter_id: + publication: + doc: Reference to publication of this sample (e.g. DOI, pubmed ID, ...) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C19026 + submitter_orchid: + doc: ORCHID of the submitter type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_000115 @@ -171,7 +202,7 @@ $graph: submitter: submitterSchema id: doc: The subject (eg the fasta/fastq file) that the metadata describes - type: string? 
+ type: string jsonldPredicate: _id: "@id" - _type: "@id" + #_type: "@id" # Error (contains undefined reference) if I keep this file but we need a toplevel id to tie things together diff --git a/example/metadata.yaml b/example/metadata.yaml index c780921..d9e8e92 100644 --- a/example/metadata.yaml +++ b/example/metadata.yaml @@ -1,3 +1,5 @@ +id: placeholder + host: host_id: XX1 host_species: string @@ -36,5 +38,4 @@ submitter: provider_sample_id: string submitter_sample_id: string authors: testAuthor - submitter_id: X12 - submitter_date: Subdate + submitter_orchid: X12 diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml index f312ab7..160d1d4 100644 --- a/example/minimal_example.yaml +++ b/example/minimal_example.yaml @@ -1,8 +1,9 @@ -submission: publicSequenceResource +id: placeholder host: host_id: XX host_species: string + host_sex: string sample: sample_id: XXX @@ -14,5 +15,4 @@ technology: submitter: submitter_name: tester - originating_lab: testLab - submitter_date: Subdate \ No newline at end of file + originating_lab: testLab \ No newline at end of file -- cgit v1.2.3 From 31686ba2e1b2edec29229ab5adb28b366893f17b Mon Sep 17 00:00:00 2001 From: lltommy Date: Sat, 11 Apr 2020 04:02:55 +0200 Subject: Adding noLinkCheck to root --- bh20sequploader/bh20seq-schema.yml | 4 +++- bh20sequploader/rdf-mappings.ttl | 0 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 bh20sequploader/rdf-mappings.ttl diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index c259f16..81a7f22 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -13,6 +13,7 @@ $graph: type: record fields: host_species: + ## autocomplete # NCBITAXON doc: Host species as defined in NCBITaxon (e.g. 
http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens) type: string jsonldPredicate: @@ -205,4 +206,5 @@ $graph: type: string jsonldPredicate: _id: "@id" - #_type: "@id" # Error (contains undefined reference) if I keep this file but we need a toplevel id to tie things together + _type: "@id" + noLinkCheck: true diff --git a/bh20sequploader/rdf-mappings.ttl b/bh20sequploader/rdf-mappings.ttl new file mode 100644 index 0000000..e69de29 -- cgit v1.2.3