diff options
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- | bh20sequploader/bh20seq-schema.yml | 89 | ||||
-rw-r--r-- | bh20sequploader/main.py | 22 | ||||
-rw-r--r-- | bh20sequploader/qc_metadata.py | 23 |
3 files changed, 129 insertions, 5 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml
new file mode 100644
index 0000000..5c962d1
--- /dev/null
+++ b/bh20sequploader/bh20seq-schema.yml
@@ -0,0 +1,89 @@
+$base: http://biohackathon.org/bh20-seq-schema
+$namespaces:
+  sch: https://schema.org/
+  efo: http://www.ebi.ac.uk/efo/
+  obo: http://purl.obolibrary.org/obo/
+$graph:
+
+- name: hostSchema
+  type: record
+  fields:
+    host_species:
+      type: string
+      jsonldPredicate:
+        _id: http://www.ebi.ac.uk/efo/EFO_0000532
+    host_id: string
+    host_common_name: string?
+    host_sex: string?
+    host_age: int?
+    host_age_unit: string?
+    host_health_status: string?
+    host_treatment:
+      type: string?
+      jsonldPredicate:
+        _id: http://www.ebi.ac.uk/efo/EFO_0000727
+    additional_host_information: string?
+
+- name: sampleSchema
+  type: record
+  fields:
+    collector_name: string
+    collecting_institution: string
+    specimen_source: string?
+    collection_date: string?
+    collection_location:
+      type: string?
+      jsonldPredicate:
+        _id: https://schema.org/fromLocation
+    sample_storage_conditions: string?
+    additional_collection_information: string?
+
+- name: virusSchema
+  type: record
+  fields:
+    virus_species: string?
+    virus_strain: string?
+
+- name: technologySchema
+  type: record
+  fields:
+    sample_sequencing_technology:
+      type: string
+      jsonldPredicate:
+        _id: http://www.ebi.ac.uk/efo/EFO_0000532
+    sequence_assembly_method:
+      type: string?
+      jsonldPredicate:
+        _id: http://www.ebi.ac.uk/efo/EFO_0002699
+    sequencing_coverage:
+      type: string?
+      jsonldPredicate:
+        _id: http://purl.obolibrary.org/obo/FLU_0000848
+
+- name: submitterSchema
+  type: record
+  fields:
+    submitter_name: string
+    submitter_address: string?
+    originating_lab: string
+    lab_address: string?
+    provider_sample_id: string?
+    submitter_sample_id: string?
+    authors: string?
+    submitter_id: string?
+
+- name: MainSchema
+  type: record
+  documentRoot: true
+  fields:
+    host: hostSchema
+    sample: sampleSchema
+    virus: virusSchema?
+    technology: technologySchema
+    submitter: submitterSchema
+    sequencefile:
+      doc: The subject (eg the fasta/fastq file) that this metadata describes
+      type: string?
+      jsonldPredicate:
+        _id: "@id"
+        _type: "@id"
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 17ad492..56cbe22 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -6,6 +6,7 @@ import json
 import urllib.request
 import socket
 import getpass
+from .qc_metadata import qc_metadata
 
 ARVADOS_API_HOST='lugli.arvadosapi.com'
 ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
@@ -19,18 +20,26 @@ def main():
 
     api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)
 
+    if not qc_metadata(args.metadata.name):
+        print("Failed metadata qc")
+        exit(1)
+
     col = arvados.collection.Collection(api_client=api)
 
-    print("Reading FASTA")
-    with col.open("sequence.fasta", "w") as f:
+    if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"):
+        target = "sequence.fasta"
+    elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"):
+        target = "reads.fastq"
+
+    with col.open(target, "w") as f:
         r = args.sequence.read(65536)
         print(r[0:20])
         while r:
             f.write(r)
             r = args.sequence.read(65536)
 
-    print("Reading JSONLD")
-    with col.open("metadata.jsonld", "w") as f:
+    print("Reading metadata")
+    with col.open("metadata.yaml", "w") as f:
         r = args.metadata.read(65536)
         print(r[0:20])
         while r:
@@ -49,4 +58,7 @@ def main():
                  (properties['upload_user'], properties['upload_ip']),
                  properties=properties, ensure_unique_name=True)
 
-main()
+    print("Done")
+
+if __name__ == "__main__":
+    main()
diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py
new file mode 100644
index 0000000..ebe4dfc
--- /dev/null
+++ b/bh20sequploader/qc_metadata.py
@@ -0,0 +1,23 @@
+import schema_salad.schema
+import logging
+import pkg_resources
+import logging
+
+def qc_metadata(metadatafile):
+    schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml")
+    cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")}
+    (document_loader,
+     avsc_names,
+     schema_metadata,
+     metaschema_loader) = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache)
+
+    if not isinstance(avsc_names, schema_salad.avro.schema.Names):
+        print(avsc_names)
+        return False
+
+    try:
+        doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True)
+        return True
+    except Exception as e:
+        logging.warn(e)
+        return False
|