diff options
Diffstat (limited to 'bh20sequploader')
| -rw-r--r-- | bh20sequploader/bh20seq-schema.yml | 210 | ||||
| -rw-r--r-- | bh20sequploader/main.py | 26 | ||||
| -rw-r--r-- | bh20sequploader/qc_fasta.py | 28 | ||||
| -rw-r--r-- | bh20sequploader/qc_metadata.py | 26 | ||||
| -rw-r--r-- | bh20sequploader/rdf-mappings.ttl | 0 | ||||
| -rw-r--r-- | bh20sequploader/validation/Makefile | 4 | ||||
| -rw-r--r-- | bh20sequploader/validation/formats | 4 | ||||
| -rw-r--r-- | bh20sequploader/validation/formats.mgc | bin | 0 -> 1032 bytes | 
8 files changed, 292 insertions, 6 deletions
| diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml new file mode 100644 index 0000000..81a7f22 --- /dev/null +++ b/bh20sequploader/bh20seq-schema.yml @@ -0,0 +1,210 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ + sio: http://semanticscience.org/resource/ + edam: http://edamontology.org/ + evs: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# + +$graph: + +- name: hostSchema + type: record + fields: + host_species: + ## autocomplete # NCBITAXON + doc: Host species as defined in NCBITaxon (e.g. http://purl.obolibrary.org/obo/NCBITaxon_9606 for Homo sapiens) + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + host_id: + doc: Identifer for the host. If you submit multiple samples from the same host, use the same host_id for those samples + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + host_common_name: + doc: Text label for the host species (e.g. homo sapiens) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NOMEN_0000037 + host_sex: + doc: Sex of the host as define in NCIT, IRI expected (http://purl.obolibrary.org/obo/C20197 (Male), http://purl.obolibrary.org/obo/NCIT_C27993 (Female) or unkown (http://purl.obolibrary.org/obo/NCIT_C17998)) + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000047 + host_age: + doc: Age of the host as number (e.g. 50) + type: int? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/PATO_0000011 + host_age_unit: + doc: Unit of host age.... this field is unstable as of now (might be removed) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/UO_0000036 + host_health_status: + doc: A condition or state at a particular time + type: string? + jsonldPredicate: http://purl.obolibrary.org/obo/NCIT_C25688 + host_treatment: + doc: Process in which the act is intended to modify or alter + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 + host_vaccination: + doc: Field is unstable + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/VO_0000001 + additional_host_information: + doc: Field for additional host information + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + +- name: sampleSchema + type: record + fields: + collector_name: + doc: Name of the person that took the sample + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001895 + collecting_institution: + doc: Institute that was responsible of sampeling + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + specimen_source: + doc: A specimen that derives from an anatomical part or substance arising from an organism, e.g. tissue, organ + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001479 + collection_date: + doc: Date when the sample was taken + type: string? + jsonldPredicate: + _id: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C25164 + collection_location: + doc: Geographical location where the sample was collected as Gazetteer (https://www.ebi.ac.uk/ols/ontologies/gaz) reference, e.g. http://purl.obolibrary.org/obo/GAZ_00002845 (China) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/GAZ_00000448 + sample_storage_conditions: + doc: Information aboout storage of a specified type, e.g. frozen specimen, paraffin, fresh .... + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0001472 + additional_collection_information: + doc: Add additional comment about the circumstances that a sample was taken + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_001167 + sample_id: + doc: Id of the sample as defined by the submitter + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + source_database_accession: + doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here + type: string? + jsonldPredicate: + _id: http://edamontology.org/data_2091 + +- name: virusSchema + type: record + fields: + virus_species: + doc: The name of a taxon from the NCBI taxonomy database + type: string? + jsonldPredicate: + _id: http://edamontology.org/data_1875 + virus_strain: + doc: Name of the virus strain + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_010055 + +- name: technologySchema + type: record + fields: + sample_sequencing_technology: + doc: Technology that was used to sequence this sample (e.g Sanger, Nanopor MiniION) + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0600047 + sequence_assembly_method: + doc: Protocol which provides instructions on the alignment of sequencing reads to reference genome + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0002699 + sequencing_coverage: + doc: Sequence coverage defined as the average number of reads representing a given nucleotide (e.g. 100x) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 + +- name: submitterSchema + type: record + fields: + submitter_name: + doc: Name of the submitter + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000116 + submitter_address: + doc: Address of the submitter + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000172 + originating_lab: + doc: Name of the laboratory that took the sample + type: string + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37984 + lab_address: + doc: Address of the laboratory where the sample was taken + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/OBI_0600047 + provider_sample_id: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C37900 + submitter_sample_id: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0001741 + authors: + doc: Name of the author(s) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C42781 + publication: + doc: Reference to publication of this sample (e.g. DOI, pubmed ID, ...) + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/NCIT_C19026 + submitter_orchid: + doc: ORCHID of the submitter + type: string? + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 + +- name: MainSchema + type: record + documentRoot: true + fields: + host: hostSchema + sample: sampleSchema + virus: virusSchema? + technology: technologySchema + submitter: submitterSchema + id: + doc: The subject (eg the fasta/fastq file) that the metadata describes + type: string + jsonldPredicate: + _id: "@id" + _type: "@id" + noLinkCheck: true diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 17ad492..4a225f6 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -3,9 +3,13 @@ import time import arvados import arvados.collection import json +import magic +from pathlib import Path import urllib.request import socket import getpass +from .qc_metadata import qc_metadata +from .qc_fasta import qc_fasta ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -13,29 +17,36 @@ UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa' def main(): parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis') - parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA') + parser.add_argument('sequence', type=argparse.FileType('r'), help='sequence FASTA/FASTQ') parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json') args = parser.parse_args() api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + target = qc_fasta(args.sequence) + + if not qc_metadata(args.metadata.name): + print("Failed metadata qc") + exit(1) + col = arvados.collection.Collection(api_client=api) - print("Reading FASTA") - with col.open("sequence.fasta", "w") as f: + with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) while r: f.write(r) r = args.sequence.read(65536) + args.sequence.close() - print("Reading JSONLD") - with col.open("metadata.jsonld", "w") as f: + print("Reading metadata") + with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: f.write(r) r = args.metadata.read(65536) + args.metadata.close() external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8') @@ -49,4 +60,7 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) -main() + print("Done") + +if __name__ == "__main__": + main() diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py new file mode 100644 index 0000000..e47d66b --- /dev/null +++ b/bh20sequploader/qc_fasta.py @@ -0,0 +1,28 @@ +import pkg_resources +import tempfile +import magic + +def qc_fasta(sequence): + schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") + with tempfile.NamedTemporaryFile() as tmp: + tmp.write(schema_resource.read()) + tmp.flush() + val = magic.Magic(magic_file=tmp.name, + uncompress=False, mime=True) + seq_type = val.from_buffer(sequence.read(4096)).lower() + sequence.seek(0) + if seq_type == "text/fasta": + # ensure that contains only one entry + entries = 0 + for line in sequence: + if line.startswith(">"): + entries += 1 + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + sequence.seek(0) + return "sequence.fasta" + elif seq_type == "text/fastq": + return "reads.fastq" + else: + raise ValueError("Sequence file does not look like FASTA or FASTQ") diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py new file mode 100644 index 0000000..e477f21 --- /dev/null +++ b/bh20sequploader/qc_metadata.py @@ -0,0 +1,26 @@ +import schema_salad.schema +import schema_salad.ref_resolver +import logging +import pkg_resources +import logging +import traceback + +def qc_metadata(metadatafile): + schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") + cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} + (document_loader, + avsc_names, + schema_metadata, + metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) + + if not isinstance(avsc_names, schema_salad.avro.schema.Names): + print(avsc_names) + return False + + try: + doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) + return True + except Exception as e: + traceback.print_exc() + logging.warn(e) + return False diff --git a/bh20sequploader/rdf-mappings.ttl b/bh20sequploader/rdf-mappings.ttl new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/bh20sequploader/rdf-mappings.ttl diff --git a/bh20sequploader/validation/Makefile b/bh20sequploader/validation/Makefile new file mode 100644 index 0000000..1ca13fb --- /dev/null +++ b/bh20sequploader/validation/Makefile @@ -0,0 +1,4 @@ +compile: formats.mgc + +formats.mgc : + file -C -m formats diff --git a/bh20sequploader/validation/formats b/bh20sequploader/validation/formats new file mode 100644 index 0000000..ac804cf --- /dev/null +++ b/bh20sequploader/validation/formats @@ -0,0 +1,4 @@ +0 regex \^\>.+\r?\n([acgtnACGTN]+\r?\n)*[acgtnACGTN]+(\r?\n)?$ FASTA +!:mime text/fasta +0 regex \^@.+\r?\n[acgtnACGTN]*\n\\+.*\n[!-i]*(\r\n)? FASTQ +!:mime text/fastq \ No newline at end of file diff --git a/bh20sequploader/validation/formats.mgc b/bh20sequploader/validation/formats.mgc new file mode 100644 index 0000000..bff282a --- /dev/null +++ b/bh20sequploader/validation/formats.mgc Binary files differ | 
