diff options
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- | bh20sequploader/bh20seq-schema.yml | 89 | ||||
-rw-r--r-- | bh20sequploader/main.py | 19 | ||||
-rw-r--r-- | bh20sequploader/qc_metadata.py | 6 |
3 files changed, 90 insertions, 24 deletions
diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 6e0973a..5c962d1 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -1,36 +1,89 @@ +$base: http://biohackathon.org/bh20-seq-schema +$namespaces: + sch: https://schema.org/ + efo: http://www.ebi.ac.uk/efo/ + obo: http://purl.obolibrary.org/obo/ $graph: -- name: sampleInformationSchema +- name: hostSchema type: record fields: - location: string - host: string - sequenceTechnology: string - assemblyMethod: string + host_species: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + host_id: string + host_common_name: string? + host_sex: string? + host_age: int? + host_age_unit: string? + host_health_status: string? + host_treatment: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000727 + additional_host_information: string? -- name: InstituteInformationSchema +- name: sampleSchema type: record fields: - OriginatingLab: string - SubmittingLab: string + collector_name: string + collecting_institution: string + specimen_source: string? + collection_date: string? + collection_location: + type: string? + jsonldPredicate: + _id: https://schema.org/fromLocation + sample_storage_conditions: string? + additional_collection_information: string? -- name: SubmitterInformationSchema +- name: virusSchema type: record fields: - Submitter: string - submissionDate: string + virus_species: string? + virus_strain: string? -- name: VirusDetailSchema +- name: technologySchema type: record fields: - VirusName: string - AccessionId: string + sample_sequencing_technology: + type: string + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0000532 + sequence_assembly_method: + type: string? + jsonldPredicate: + _id: http://www.ebi.ac.uk/efo/EFO_0002699 + sequencing_coverage: + type: string? + jsonldPredicate: + _id: http://purl.obolibrary.org/obo/FLU_0000848 + +- name: submitterSchema + type: record + fields: + submitter_name: string + submitter_address: string? + originating_lab: string + lab_address: string? + provider_sample_id: string? + submitter_sample_id: string? + authors: string? + submitter_id: string? - name: MainSchema type: record documentRoot: true fields: - sampleInformation: sampleInformationSchema - InstituteInformation: InstituteInformationSchema - SubmitterInformation: SubmitterInformationSchema - VirusDetail: VirusDetailSchema + host: hostSchema + sample: sampleSchema + virus: virusSchema? + technology: technologySchema + submitter: submitterSchema + sequencefile: + doc: The subject (eg the fasta/fastq file) that this metadata describes + type: string? + jsonldPredicate: + _id: "@id" + _type: "@id" diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index d3ebc0c..bf74ea5 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -6,6 +6,7 @@ import json import urllib.request import socket import getpass +import qc_metadata ARVADOS_API_HOST='lugli.arvadosapi.com' ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462' @@ -19,18 +20,26 @@ def main(): api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True) + if not qc_metadata(args.metadata.name): + print("Failed metadata qc") + exit(1) + col = arvados.collection.Collection(api_client=api) - print("Reading FASTA") - with col.open("sequence.fasta", "w") as f: + if args.sequence.name.endswith("fasta") or args.sequence.name.endswith("fa"): + target = "sequence.fasta" + elif args.sequence.name.endswith("fastq") or args.sequence.name.endswith("fq"): + target = "reads.fastq" + + with col.open(target, "w") as f: r = args.sequence.read(65536) print(r[0:20]) while r: f.write(r) r = args.sequence.read(65536) - print("Reading JSONLD") - with col.open("metadata.jsonld", "w") as f: + print("Reading metadata") + with col.open("metadata.yaml", "w") as f: r = args.metadata.read(65536) print(r[0:20]) while r: @@ -49,5 +58,7 @@ def main(): (properties['upload_user'], properties['upload_ip']), properties=properties, ensure_unique_name=True) + print("Done") + if __name__ == "__main__": main() diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 78b31b2..ebe4dfc 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -1,6 +1,7 @@ import schema_salad.schema import logging import pkg_resources +import logging def qc_metadata(metadatafile): schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") @@ -17,5 +18,6 @@ def qc_metadata(metadatafile): try: doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) return True - except: - return False + except Exception as e: + logging.warn(e) + return False |