diff options
-rw-r--r-- | scripts/from_genbank_to_fasta_and_yaml.py | 141 | ||||
-rw-r--r-- | scripts/sequences.acc | 877 |
2 files changed, 1018 insertions, 0 deletions
"""Fetch SARS-CoV-2 records from NCBI nuccore and write per-record FASTA/YAML.

Source file: scripts/from_genbank_to_fasta_and_yaml.py

Steps performed by :func:`main`:

1. Collect accession ids from several Entrez searches plus the local
   accession list (``sequences.acc``).
2. Download the GenBank XML for all ids, unless the dated XML file already
   exists in the current directory.
3. For every record that carries a sequence, write
   ``<accession.version>.fasta`` and ``<accession.version>.yaml`` into the
   current directory.

Biopython and PyYAML are imported inside the functions that need them so the
pure parsing helpers can be imported (and tested) without those packages.
"""
import os
import xml.etree.ElementTree as ET

entrez_email = 'your_email_to_be_polite'

path_ncbi_virus_accession = 'sequences.acc'

date = '20200414'
path_seq_fasta = 'seq_from_nuccore.{}.fasta'.format(date)
path_metadata_xml = 'metadata_from_nuccore.{}.xml'.format(date)

term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']

species_to_taxid_dict = {
    'Homo sapiens': 9606
}

# Delimiters of the structured "Assembly-Data" section inside GBSeq_comment.
_ASSEMBLY_DATA_START = '##Assembly-Data-START## ; '
_ASSEMBLY_DATA_END = ' ; ##Assembly-Data-END##'

# Assembly-Data label -> key in the 'technology' section of the YAML.
_ASSEMBLY_FIELDS = (
    ('Assembly Method', 'sequence_assembly_method'),
    ('Coverage', 'sequencing_coverage'),
    ('Sequencing Technology', 'sample_sequencing_technology'),
)

# 'source' qualifier name -> key in the 'sample' section of the YAML.
# NOTE: 'lat_lon' and 'country' share one key, so whichever qualifier comes
# last in a record wins (same as the original behaviour).
_SAMPLE_FIELDS = {
    'isolation_source': 'specimen_source',
    # TODO: which date format will we use?
    'collection_date': 'collection_date',
    'lat_lon': 'collection_location',
    'country': 'collection_location',
    'note': 'additional_collection_information',
}

# A 'collected_by' value containing one of these words is stored as an
# institution, anything else as a person's name.
_INSTITUTION_KEYWORDS = ('institute', 'hospital', 'city', 'center')


def _get_entrez():
    """Import ``Bio.Entrez`` on first use and set the contact e-mail."""
    from Bio import Entrez

    Entrez.email = entrez_email
    return Entrez


def search_accession_ids(terms):
    """Return the set of version-less accessions matching any search term.

    Runs one ``esearch`` on the ``nuccore`` database per term and prints the
    number of hits for each term.
    """
    Entrez = _get_entrez()
    ids = set()
    for term in terms:
        handle = Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
        try:
            found = Entrez.read(handle)['IdList']
        finally:
            handle.close()
        print(term, len(found))

        # Remove the version in the id
        ids.update(x.split('.')[0] for x in found)
    return ids


def read_accession_file(path):
    """Return the accessions listed in *path*, one per line.

    Surrounding whitespace is removed and blank lines are skipped so that no
    empty id is ever sent to Entrez.
    """
    with open(path) as f:
        stripped = (line.strip() for line in f)
        return [accession for accession in stripped if accession]


def download_metadata(ids, xml_path):
    """Fetch the GenBank XML for *ids* from nuccore and store it at *xml_path*.

    The payload is written to a temporary file first and renamed afterwards,
    so an interrupted run does not leave a truncated file that a later run
    would mistake for a complete download.
    """
    Entrez = _get_entrez()
    handle = Entrez.efetch(db='nuccore', id=sorted(ids), retmode='xml')
    try:
        data = handle.read()
    finally:
        handle.close()

    # Depending on the Biopython version the handle may yield bytes or str
    # (TODO confirm for the installed version); pick the matching file mode.
    mode = 'wb' if isinstance(data, bytes) else 'w'
    tmp_path = xml_path + '.part'
    with open(tmp_path, mode) as fw:
        fw.write(data)
    os.replace(tmp_path, xml_path)


def parse_assembly_data(comment_text):
    """Extract the 'technology' fields from a ``GBSeq_comment`` text.

    Returns an empty dict when the text is empty or does not contain the
    Assembly-Data start delimiter; labels that are absent are left out.
    """
    technology = {}
    if not comment_text or _ASSEMBLY_DATA_START not in comment_text:
        return technology

    section = comment_text.split(_ASSEMBLY_DATA_START, 1)[1]
    section = section.split(_ASSEMBLY_DATA_END, 1)[0]

    for label, yaml_key in _ASSEMBLY_FIELDS:
        marker = '{} :: '.format(label)
        if marker in section:
            technology[yaml_key] = section.split(marker, 1)[1].split(' ;', 1)[0]
    return technology


def parse_host(value_text):
    """Split a ``host`` qualifier such as ``'Homo sapiens; male; age 65'``.

    The first field is the common name (and gives the taxid when the species
    is known). A field starting with ``'age '`` gives the age in years; it is
    ignored when the number is not an integer. Otherwise the field directly
    after the name is the sex (``male``/``female``) or, failing that, the
    health status.
    """
    parts = value_text.split('; ')
    host = {'host_common_name': parts[0]}

    if parts[0] in species_to_taxid_dict:
        host['host_species'] = species_to_taxid_dict[parts[0]]

    for position, part in enumerate(parts[1:]):
        if part.startswith('age '):
            try:
                age = int(part[len('age '):])
            except ValueError:
                # Non-integer ages are skipped instead of aborting the run.
                continue
            host['host_age'] = age
            host['host_age_unit'] = 'year'
        elif position == 0:
            if part in ('male', 'female'):
                host['host_sex'] = part
            else:
                host['host_health_status'] = part
    return host


def parse_taxon(value_text):
    """Return the integer taxid of a ``'taxon:<id>'`` db_xref, else ``None``."""
    if 'taxon:' not in value_text:
        return None
    try:
        return int(value_text.split('taxon:', 1)[1])
    except ValueError:
        return None


def gbseq_to_yaml_dict(GBSeq):
    """Build the metadata dict for one ``GBSeq`` XML element.

    The result always has the keys ``id``, ``host``, ``sample``, ``virus``,
    ``technology`` and ``submitter``; the nested dicts only contain the
    fields that could be read from the record. Only qualifiers of ``source``
    features are used.
    """
    # A general default-empty yaml could be read from the definitive one
    info = {
        'id': 'placeholder',
        'host': {},
        'sample': {},
        'virus': {},
        'technology': {},
        'submitter': {},
    }

    info['sample']['sample_id'] = GBSeq.find('GBSeq_accession-version').text
    info['submitter']['authors'] = ';'.join(
        author.text for author in GBSeq.iter('GBAuthor') if author.text
    )

    comment = GBSeq.find('GBSeq_comment')
    if comment is not None:
        info['technology'].update(parse_assembly_data(comment.text))

    for feature in GBSeq.iter('GBFeature'):
        if feature.find('GBFeature_key').text != 'source':
            continue

        for qualifier in feature.iter('GBQualifier'):
            value = qualifier.find('GBQualifier_value')
            if value is None or value.text is None:
                # Flag-like qualifiers carry no value.
                continue
            value_text = value.text
            name = qualifier.find('GBQualifier_name').text

            if name == 'host':
                info['host'].update(parse_host(value_text))
            elif name == 'collected_by':
                lowered = value_text.lower()
                if any(word in lowered for word in _INSTITUTION_KEYWORDS):
                    info['sample']['collecting_institution'] = value_text
                else:
                    info['sample']['collector_name'] = value_text
            elif name in _SAMPLE_FIELDS:
                info['sample'][_SAMPLE_FIELDS[name]] = value_text
            elif name == 'isolate':
                info['virus']['virus_strain'] = value_text
            elif name == 'db_xref':
                taxid = parse_taxon(value_text)
                if taxid is not None:
                    info['virus']['virus_species'] = taxid
    return info


def format_fasta_record(accession_version, sequence):
    """Return a single FASTA record with an upper-case sequence.

    The record ends with a newline so that per-record files can be
    concatenated safely.
    """
    return '>{}\n{}\n'.format(accession_version, sequence.upper())


def write_record_files(accession_version, sequence, info_for_yaml_dict):
    """Write ``<accession_version>.fasta`` and ``.yaml`` in the current directory."""
    import yaml

    with open('{}.fasta'.format(accession_version), 'w') as fw:
        fw.write(format_fasta_record(accession_version, sequence))

    with open('{}.yaml'.format(accession_version), 'w') as fw:
        yaml.dump(info_for_yaml_dict, fw, default_flow_style=False)


def main():
    """Run the whole pipeline: collect ids, download metadata, write files."""
    # Take all the ids
    id_set = search_accession_ids(term_list)
    print(term_list, len(id_set))

    tmp_list = read_accession_file(path_ncbi_virus_accession)
    print('NCBI Virus', len(tmp_list))
    id_set.update(tmp_list)

    print(term_list + ['NCBI Virus'], len(id_set))

    if not os.path.exists(path_metadata_xml):
        # TODO: check whether the records are already available locally?
        download_metadata(id_set, path_metadata_xml)

    GBSet = ET.parse(path_metadata_xml).getroot()

    for GBSeq in GBSet:
        accession_version = GBSeq.find('GBSeq_accession-version').text

        GBSeq_sequence = GBSeq.find('GBSeq_sequence')
        if GBSeq_sequence is None or not GBSeq_sequence.text:
            print(accession_version, ' - sequence not found')
            continue

        write_record_files(
            accession_version, GBSeq_sequence.text, gbseq_to_yaml_dict(GBSeq)
        )


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# The rest of this span is not part of the script: it is the beginning of the
# diff for scripts/sequences.acc (the accession list continues after it),
# kept verbatim:
#
# diff --git a/scripts/sequences.acc b/scripts/sequences.acc new file mode 100644 index 0000000..62bde2c --- /dev/null +++ b/scripts/sequences.acc @@ -0,0 +1,877 @@
# +MT325599 +MT325601 +MT325602 +MT325607 +MT325608 +MT325609 +MT325610 +MT325612 +MT325616 +MT325617
# +MT325618 +MT325622 +MT325623 +MT325600 +MT325606 +MT325611 +MT325613 +MT325615 +MT325619 +MT325620
# +MT325624 +MT325625 +MT325565 +MT325566 +MT326147 +MT326153 +MT326173 +MT326174 +MT326176 +MT326186
# +MT326023 +MT326026 +MT326027 +MT326034 +MT326039 +MT326042 +MT326045 +MT326047 +MT326051 +MT326054
# +MT326060 +MT326061 +MT326064 +MT326065 +MT326068 +MT326072 +MT326076 +MT326078 +MT326079 +MT325590
# +MT325640 +MT326130 +MT326129 +MT326128 +MT326121 +MT326119 +MT326109 +MT326100 +MT325568 +MT324679
# +MT325561 +MT325571 +MT325585 +MT325587 +MT325588 +MT325589 +MT325596 +MT325597 +MT325603 +MT325614
# +MT325621 +MT325629 +MT325630 +MT325638 +MT325639 +MT326086 +MT326087 +MT326102 +MT326104 +MT326105
# +MT326123 +MT328033 +MT328034 +MT325562 +MT325564 +MT325567 +MT326154 +MT326155 +MT326156 +MT326157
# +MT326163 +MT326165 +MT326175 +MT326177 +MT326184 +MT326185 +MT326187
+MT325572 +MT325575 +MT325583 +MT325584 +MT325604 +MT325631 +MT325632 +MT325635 +MT325636 +MT325637 +MT326095 +MT326096 +MT326103 +MT326112 +MT326113 +MT326114 +MT326115 +MT326122 +MT326131 +MT326132 +MT326133 +MT325563 +MT326164 +MT326166 +MT326167 +MT325569 +MT326097 +MT326106 +MT326107 +MT326116 +MT326117 +MT326124 +MT326125 +MT326126 +MT326127 +MT326134 +MT326135 +MT326136 +MT326137 +MT326138 +MT326139 +MT326140 +MT326141 +MT326142 +MT326143 +MT326144 +MT326145 +MT326146 +MT326148 +MT326149 +MT326150 +MT326151 +MT326152 +MT326158 +MT326159 +MT326160 +MT326161 +MT326162 +MT326168 +MT326169 +MT326170 +MT326171 +MT326172 +MT326178 +MT326179 +MT326180 +MT326181 +MT326182 +MT326183 +MT326188 +MT326189 +MT326190 +MT326191 +MT326120 +MT326118 +MT326111 +MT326110 +MT326108 +MT326101 +MT326099 +MT326098 +MT326094 +MT326093 +MT326092 +MT326091 +MT326090 +MT326085 +MT326084 +MT326083 +MT326082 +MT326081 +MT326080 +MT326077 +MT326067 +MT326057 +MT326024 +MT326025 +MT326032 +MT326033 +MT326035 +MT326036 +MT326037 +MT326040 +MT326041 +MT326043 +MT326044 +MT326046 +MT326049 +MT326050 +MT326052 +MT326053 +MT326055 +MT326056 +MT326059 +MT326062 +MT326063 +MT326066 +MT326069 +MT326070 +MT326071 +MT326073 +MT326074 +MT326075 +MT326088 +MT326089 +MT327745 +MT324062 +MT324680 +MT324684 +MT325573 +MT325574 +MT325576 +MT325577 +MT325578 +MT325580 +MT325591 +MT325592 +MT325593 +MT325595 +MT325605 +MT325627 +MT326028 +MT326029 +MT326031 +MT326048 +MT325570 +MT325579 +MT325581 +MT325582 +MT325586 +MT325594 +MT325598 +MT325626 +MT325628 +MT325633 +MT325634 +MT326030 +MT326038 +MT326058 +MT324681 +MT324682 +MT324683 +MT328032 +MT328035 +MT039874 +MT077125 +MT322394 +MT322397 +MT322398 +MT322399 +MT322400 +MT322401 +MT322403 +MT322404 +MT322405 +MT322406 +MT322408 +MT322409 +MT322410 +MT322411 +MT322412 +MT322413 +MT322414 +MT322415 +MT322416 +MT322417 +MT322418 +MT322419 +MT322420 +MT322421 +MT322422 +MT322423 +MT322424 +MT322396 +MT322402 +MT322395 +MT322407 +MT320538 +MT320891 +MT308692 
+MT308693 +MT308698 +MT308699 +MT308703 +MT308704 +MT308694 +MT308695 +MT308696 +MT308697 +MT308700 +MT308701 +MT308702 +MT304476 +MT304474 +MT304475 +MT293547 +MT304477 +MT304483 +MT300186 +MT304478 +MT304479 +MT304480 +MT304481 +MT304482 +MT304484 +MT304485 +MT304486 +MT304487 +MT304488 +MT304489 +MT304490 +MT304491 +MT291831 +MT291836 +MT291834 +MT291835 +MT292570 +MT293173 +MT292574 +MT293179 +MT293181 +MT293183 +MT293195 +MT293196 +MT293201 +MT293204 +MT291829 +MT291830 +MT291827 +MT292572 +MT292577 +MT293186 +MT293187 +MT293188 +MT292580 +MT292581 +MT292571 +MT292576 +MT292578 +MT293185 +MT293160 +MT293161 +MT293199 +MT292579 +MT291828 +MT293166 +MT293167 +MT293168 +MT293175 +MT293190 +MT293191 +MT273658 +MT293159 +MT292582 +MT293162 +MT293163 +MT293164 +MT293165 +MT293156 +MT293157 +MT293158 +MT281577 +MT293171 +MT293174 +MT293176 +MT293182 +MT293210 +MT293211 +MT293217 +MT293218 +MT295465 +MT293213 +MT293221 +MT295464 +MT292569 +MT293169 +MT293172 +MT293177 +MT293200 +MT293198 +MT293205 +MT293207 +MT293212 +MT293216 +MT293219 +MT293222 +MT293224 +MT293225 +MT293206 +MT293208 +MT293209 +MT293214 +MT293215 +MT293220 +MT293170 +MT292573 +MT293178 +MT292575 +MT293180 +MT293184 +MT293189 +MT293192 +MT293193 +MT293194 +MT293197 +MT293202 +MT293203 +MT293223 +MT291826 +MT291832 +MT291833 +MT281530 +MT276331 +MT276325 +MT276324 +MT276326 +MT276327 +MT276330 +MT276329 +MT276597 +MT276598 +MT276323 +MT276328 +MT262896 +MT263385 +MT263392 +MT262900 +MT262901 +MT262902 +MT262909 +MT262911 +MT262912 +MT263382 +MT263383 +MT263384 +MT263423 +MT263431 +MT263421 +MT263443 +MT263461 +MT263420 +MT263429 +MT263434 +MT263435 +MT263437 +MT263445 +MT263428 +MT263433 +MT263436 +MT263438 +MT263440 +MT263444 +MT263452 +MT263455 +MT263456 +MT263462 +MT263463 +MT263466 +MT263446 +MT263447 +MT263448 +MT263449 +MT263451 +MT263453 +MT263388 +MT263391 +MT262903 +MT262906 +MT262907 +MT262908 +MT262913 +MT262914 +MT263390 +MT263398 +MT263403 +MT263430 +MT263399 +MT263404 +MT263405 +MT263414 
+MT263389 +MT263393 +MT263394 +MT263395 +MT263396 +MT263397 +MT263402 +MT263469 +MT263441 +MT263454 +MT263467 +MT263465 +MT263468 +MT263439 +MT263457 +MT263460 +MT263450 +MT263458 +MT263459 +MT263464 +MT263386 +MT263387 +MT262897 +MT262899 +MT262904 +MT262905 +MT262910 +MT263408 +MT263412 +MT263416 +MT263417 +MT263422 +MT263432 +MT263419 +MT263424 +MT263427 +MT263442 +MT263413 +MT263418 +MT263425 +MT263401 +MT263409 +MT263410 +MT263411 +MT263426 +MT263406 +MT263407 +MT263415 +MT262993 +MT263074 +MT263381 +MT262898 +MT262915 +MT262916 +MT263400 +MT259257 +MT259261 +MT259262 +MT259263 +MT259264 +MT259268 +MT259269 +MT259270 +MT259271 +MT259272 +MT259273 +MT259275 +MT259276 +MT259277 +MT259279 +MT259244 +MT259245 +MT258381 +MT258377 +MT258379 +MT259226 +MT259281 +MT259282 +MT259283 +MT258378 +MT259231 +MT259274 +MT259286 +MT256917 +MT259227 +MT259238 +MT258382 +MT259246 +MT259253 +MT259254 +MT259255 +MT259259 +MT259265 +MT259284 +MT259252 +MT259229 +MT259230 +MT259260 +MT259285 +MT259278 +MT259280 +MT259247 +MT259240 +MT259243 +MT259249 +MT259250 +MT259251 +MT259256 +MT259258 +MT259266 +MT259267 +MT259287 +MT259241 +MT259242 +MT259228 +MT259236 +MT258383 +MT259248 +MT256918 +MT258380 +MT259235 +MT259237 +MT259239 +MT256924 +LC534419 +LC534418 +MT253704 +MT253710 +MT253701 +MT253702 +MT253703 +MT253706 +MT253707 +MT251972 +MT251973 +MT251975 +MT251976 +MT251978 +MT251979 +MT251977 +MT251980 +MT253696 +MT253697 +MT253698 +MT253699 +MT251974 +MT253700 +MT253705 +MT253709 +MT253708 +MT233526 +MT246667 +MT246451 +MT246453 +MT246454 +MT246461 +MT246462 +MT246490 +MT246450 +MT246452 +MT246464 +MT246470 +MT246474 +MT246480 +MT246481 +MT246482 +MT246457 +MT246459 +MT246466 +MT246489 +MT246456 +MT246458 +MT246475 +MT246476 +MT246477 +MT246479 +MT246487 +MT246449 +MT246455 +MT246468 +MT246469 +MT246486 +MT246488 +MT246467 +MT246478 +MT246485 +MT246460 +MT246463 +MT246465 +MT246471 +MT246472 +MT246473 +MT246483 +MT246484 +MT240479 +MT232869 +MT232870 +MT233522 +MT232871 +MT232872 
+MT233520 +MT233523 +MT233519 +MT233521 +MT226610 +MT198653 +MT198651 +MT198652 +MT192759 +MT192765 +MT192772 +MT192773 +MT192758 +MT188341 +MT188339 +MT188340 +MT186680 +MT186676 +MT186677 +MT186679 +MT186678 +MT187977 +MT186681 +MT186682 +MT184912 +MT184910 +MT184911 +MT184913 +MT184909 +MT184907 +MT184908 +CADDYA000000000 +MT163712 +MT163716 +MT163719 +MT163720 +MT163715 +MT163721 +MT163714 +MT163717 +MT163737 +MT163738 +MT163718 +MT159706 +MT159707 +MT159717 +MT159716 +MT159719 +MT159709 +MT159710 +MT159712 +MT159713 +MT159714 +MT159722 +MT159711 +MT159715 +MT159718 +MT159720 +MT159721 +MT159708 +MT121215 +MT159778 +MT159705 +MT161607 +MT066156 +LC529905 +MT050493 +MT012098 +MT152824 +MT152900 +MT135043 +MT135042 +MT135041 +MT135044 +MT127116 +MT127113 +MT127114 +MT127115 +MT126808 +LC528233 +LC528232 +MT123290 +MT123291 +MT123292 +MT123293 +MT118835 +MT111896 +MT111895 +MT106052 +MT106053 +MT106054 +MT093571 +MT093631 +MT081059 +MT081068 +MT081060 +MT081061 +MT081065 +MT081067 +MT081062 +MT081063 +MT081064 +MT081066 +MT072667 +MT072688 +MT072668 +MT066158 +MT066157 +MT066159 +MT066175 +MT066176 +LC523807 +LC523808 +LC523809 +MT044258 +MT044257 +MT042777 +MT042778 +MT042776 +MT049951 +MT050414 +MT050415 +MT050417 +MT050416 +MT042774 +MT042775 +MT042773 +MT039887 +MT039888 +MT039890 +MT039873 +LC522350 +MT027062 +MT027063 +MT027064 +MT020781 +MT019530 +MT019531 +MT020881 +MT019533 +MT019529 +MT019532 +MT020880 +LR757995 +LR757996 +LR757997 +LR757998 +MT007544 +MT008023 +MT008022 +MN996530 +MN996531 +MN996527 +MN996528 +MN996529 +MN997409 +MN994468 +MN988668 +MN988669 +MN994467 +MN988713 +MN938387 +MN938389 +MN975263 +MN975268 +MN975267 +MN938388 +MN938390 +MN975264 +MN975265 +MN975266 +MN938386 +MN938385 +MN938384 +MN975262 +MN985325 +MN970003 +MN970004 +NC_045512 +MN908947 |