diff options
author | lltommy | 2020-04-21 16:49:47 +0200 |
---|---|---|
committer | lltommy | 2020-04-21 16:49:47 +0200 |
commit | 85b85b676d7ecc218d9f84357b2e7ea0133eed94 (patch) | |
tree | 0c2b445aa1fd653b57be6ddf341183f73b28350f /scripts | |
parent | ecae9863069585d88fc88b7f2a7434479f7425c1 (diff) | |
download | bh20-seq-resource-85b85b676d7ecc218d9f84357b2e7ea0133eed94.tar.gz bh20-seq-resource-85b85b676d7ecc218d9f84357b2e7ea0133eed94.tar.lz bh20-seq-resource-85b85b676d7ecc218d9f84357b2e7ea0133eed94.zip |
Updated shex and mandatory fields and stuff
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/from_genbank_to_fasta_and_yaml.py | 19 |
1 files changed, 13 insertions, 6 deletions
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py index 0c410d7..7e7c089 100644 --- a/scripts/from_genbank_to_fasta_and_yaml.py +++ b/scripts/from_genbank_to_fasta_and_yaml.py @@ -1,5 +1,5 @@ from Bio import Entrez -Entrez.email = 'insert_your_email@gmail.com' +Entrez.email = 'another_email@gmail.com' import xml.etree.ElementTree as ET import yaml @@ -31,6 +31,8 @@ for term in term_list: tmp_list = [x.split('.')[0] for x in tmp_list] print(term, len(tmp_list)) + tmp_list=tmp_list +# tmp_list = tmp_list[0:2] # restricting to small run id_set.update([x.split('.')[0] for x in tmp_list]) @@ -78,7 +80,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x term_to_uri_dict[term] = uri species_to_taxid_dict = { - 'Homo sapiens': 9606 + 'Homo sapiens': 'http://purl.obolibrary.org/obo/NCBITaxon_9606' } @@ -108,8 +110,8 @@ if not os.path.exists(dir_fasta_and_yaml_today): 'submitter': {} } - info_for_yaml_dict['sample']['sample_id'] = accession_version + info_for_yaml_dict['sample']['source_database_accession'] = accession_version info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')]) @@ -163,7 +165,7 @@ if not os.path.exists(dir_fasta_and_yaml_today): if GBQualifier_name_text == 'host': GBQualifier_value_text_list = GBQualifier_value_text.split('; ') - info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] + #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed if GBQualifier_value_text_list[0] in species_to_taxid_dict: info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]] @@ -206,8 +208,13 @@ if not os.path.exists(dir_fasta_and_yaml_today): elif GBQualifier_name_text == 'isolate': info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text elif GBQualifier_name_text == 'db_xref': - info_for_yaml_dict['virus']['virus_species'] = 
int(GBQualifier_value_text.split('taxon:')[1]) - + info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1] + + + #Remove technology key if empty! + if (info_for_yaml_dict['technology']=={}): + del info_for_yaml_dict['key'] + with open(os.path.join(dir_fasta_and_yaml_today, '{}.fasta'.format(accession_version)), 'w') as fw: fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) |