From 85b85b676d7ecc218d9f84357b2e7ea0133eed94 Mon Sep 17 00:00:00 2001 From: lltommy Date: Tue, 21 Apr 2020 16:49:47 +0200 Subject: Updated shex and mandatory fields and stuff --- bh20sequploader/bh20seq-schema.yml | 10 +++++----- bh20sequploader/bh20seq-shex.rdf | 4 ++-- example/minimal_example.yaml | 6 +----- scripts/from_genbank_to_fasta_and_yaml.py | 19 +++++++++++++------ 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/bh20sequploader/bh20seq-schema.yml b/bh20sequploader/bh20seq-schema.yml index 57f3b3d..75308ab 100644 --- a/bh20sequploader/bh20seq-schema.yml +++ b/bh20sequploader/bh20seq-schema.yml @@ -66,6 +66,11 @@ $graph: - name: sampleSchema type: record fields: + sample_id: + doc: Id of the sample as defined by the submitter + type: string + jsonldPredicate: + _id: http://semanticscience.org/resource/SIO_000115 collection_date: doc: Date when the sample was taken type: string @@ -111,11 +116,6 @@ $graph: type: string? jsonldPredicate: _id: http://semanticscience.org/resource/SIO_001167 - sample_id: - doc: Id of the sample as defined by the submitter - type: string? - jsonldPredicate: - _id: http://semanticscience.org/resource/SIO_000115 source_database_accession: doc: If data is deposit at a public resource (e.g. Genbank, ENA) enter the Accession Id here type: string? 
diff --git a/bh20sequploader/bh20seq-shex.rdf b/bh20sequploader/bh20seq-shex.rdf index 6e646c7..59ee71b 100644 --- a/bh20sequploader/bh20seq-shex.rdf +++ b/bh20sequploader/bh20seq-shex.rdf @@ -28,11 +28,11 @@ PREFIX wikidata: } :sampleShape { - evs:C25164 xsd:string?; + sio:SIO_000115 xsd:string; obo:GAZ_00000448 [wikidata:~] ; + evs:C25164 xsd:string; obo:OBI_0001895 xsd:string ?; sio:SIO_001167 xsd:string ?; - sio:SIO_000115 xsd:string ?; obo:OBI_0001472 xsd:string ?; obo:OBI_0001479 IRI {0,2}; } diff --git a/example/minimal_example.yaml b/example/minimal_example.yaml index ed578e2..0e36a25 100644 --- a/example/minimal_example.yaml +++ b/example/minimal_example.yaml @@ -1,13 +1,10 @@ id: placeholder host: - host_id: XX1 host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 sample: sample_id: XX - collector_name: John Doe - collecting_institution: Doe university collection_date: 2020-01 collection_location: http://www.wikidata.org/entity/Q148 @@ -18,5 +15,4 @@ technology: sample_sequencing_technology: http://www.ebi.ac.uk/efo/EFO_0008632 submitter: - submitter_name: John Doe - originating_lab: John Doe's kitchen \ No newline at end of file + authors: John Doe \ No newline at end of file diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 0c410d7..7e7c089 100644 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -1,5 +1,5 @@ from Bio import Entrez -Entrez.email = 'insert_your_email@gmail.com' +Entrez.email = 'another_email@gmail.com' import xml.etree.ElementTree as ET import yaml @@ -31,6 +31,8 @@ for term in term_list: tmp_list = [x.split('.')[0] for x in tmp_list] print(term, len(tmp_list)) + tmp_list=tmp_list +# tmp_list = tmp_list[0:2] # restricting to small run id_set.update([x.split('.')[0] for x in tmp_list]) @@ -78,7 +80,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x term_to_uri_dict[term] = uri 
species_to_taxid_dict = { - 'Homo sapiens': 9606 + 'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606' } @@ -108,8 +110,8 @@ if not os.path.exists(dir_fasta_and_yaml_today): 'submitter': {} } - info_for_yaml_dict['sample']['sample_id'] = accession_version + info_for_yaml_dict['sample']['source_database_accession'] = accession_version info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')]) @@ -163,7 +165,7 @@ if not os.path.exists(dir_fasta_and_yaml_today): if GBQualifier_name_text == 'host': GBQualifier_value_text_list = GBQualifier_value_text.split('; ') - info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] + #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed if GBQualifier_value_text_list[0] in species_to_taxid_dict: info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]] @@ -206,8 +208,13 @@ if not os.path.exists(dir_fasta_and_yaml_today): elif GBQualifier_name_text == 'isolate': info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text elif GBQualifier_name_text == 'db_xref': - info_for_yaml_dict['virus']['virus_species'] = int(GBQualifier_value_text.split('taxon:')[1]) - + info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1] + + + #Remove technology key if empty! + if (info_for_yaml_dict['technology']=={}): + del info_for_yaml_dict['technology'] + with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw: fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) -- cgit v1.2.3