about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: lltommy, 2020-04-15 07:40:56 +0200
committer: lltommy, 2020-04-15 07:40:56 +0200
commit: 7b7811a07a7c2adbba51032765a7cccbaecce5d9 (patch)
tree: b51783b9aa7f797974f725dcac35a08d1c85fa19
parent: e0f52b36ce314e1a2bcd048462794427bb44ec45 (diff)
parent: 3e613390016c3b01f7d44914cb9c2718fff8b9ec (diff)
download: bh20-seq-resource-7b7811a07a7c2adbba51032765a7cccbaecce5d9.tar.gz
bh20-seq-resource-7b7811a07a7c2adbba51032765a7cccbaecce5d9.tar.lz
bh20-seq-resource-7b7811a07a7c2adbba51032765a7cccbaecce5d9.zip
Merge branch 'master' of https://github.com/arvados/bh20-seq-resource
-rw-r--r-- scripts/from_genbank_to_fasta_and_yaml.py 141
-rw-r--r-- scripts/sequences.acc 877
2 files changed, 1018 insertions, 0 deletions
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
new file mode 100644
index 0000000..0cc1a57
--- /dev/null
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -0,0 +1,141 @@
+from Bio import Entrez
+Entrez.email = 'your_email_to_be_polite'
+
+import xml.etree.ElementTree as ET
+import yaml
+import os
+
+# Plain-text list of accessions exported from NCBI Virus, one per line.
+path_ncbi_virus_accession = 'sequences.acc'
+
+# The date tags the cached download so that a new date triggers a new fetch.
+date = '20200414'
+path_seq_fasta = 'seq_from_nuccore.{}.fasta'.format(date)
+path_metadata_xml = 'metadata_from_nuccore.{}.xml'.format(date)
+
+# Take all the ids (accessions without the version suffix), de-duplicated
+# across the search terms and the NCBI Virus list.
+id_set = set()
+
+term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
+for term in term_list:
+    search_handle = Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
+    try:
+        search_result = Entrez.read(search_handle)
+    finally:
+        search_handle.close()
+
+    tmp_list = search_result['IdList']
+    print(term, len(tmp_list))
+
+    # esearch returns at most retmax ids; report it instead of silently
+    # working on a truncated id list.
+    total_hits = int(search_result.get('Count', len(tmp_list)))
+    if total_hits > len(tmp_list):
+        print('WARNING: {} matches {} records but only {} ids were returned'.format(
+            term, total_hits, len(tmp_list)
+        ))
+
+    # Remove the version in the id
+    id_set.update([x.split('.')[0] for x in tmp_list])
+
+print(term_list, len(id_set))
+
+with open(path_ncbi_virus_accession) as f:
+    # Drop blank lines and surrounding whitespace: an empty string in id_set
+    # would otherwise be sent to efetch as an id.
+    tmp_list = [line.strip() for line in f if line.strip()]
+
+print('NCBI Virus', len(tmp_list))
+id_set.update(tmp_list)
+
+print(term_list + ['NCBI Virus'], len(id_set))
+
+if not os.path.exists(path_metadata_xml):
+    # TO_DO: to check if I already have the records?
+
+    # Download before creating the file: if efetch fails, no empty file is
+    # left behind to be mistaken for a valid cache on the next run.
+    # sorted() only makes the request order deterministic.
+    fetch_handle = Entrez.efetch(db='nuccore', id=sorted(id_set), retmode='xml')
+    try:
+        metadata_xml = fetch_handle.read()
+    finally:
+        fetch_handle.close()
+
+    # NOTE(review): read() may give str or bytes depending on the Biopython
+    # version - confirm. Writing bytes keeps the file independent of the
+    # locale encoding; assumes the efetch XML is UTF-8 - TODO confirm.
+    if isinstance(metadata_xml, str):
+        metadata_xml = metadata_xml.encode('utf-8')
+
+    with open(path_metadata_xml, 'wb') as fw:
+        fw.write(metadata_xml)
+
+
+tree = ET.parse(path_metadata_xml)
+GBSet = tree.getroot()
+
+species_to_taxid_dict = {
+    'Homo sapiens': 9606
+}
+
+# Label inside the Assembly-Data structured comment -> key in the
+# 'technology' section of the yaml.
+assembly_data_to_yaml_field = (
+    ('Assembly Method', 'sequence_assembly_method'),
+    ('Coverage', 'sequencing_coverage'),
+    ('Sequencing Technology', 'sample_sequencing_technology'),
+)
+
+
+def _parse_assembly_data(comment_text):
+    """Return the 'technology' fields found in a GBSeq_comment text.
+
+    Only the part between '##Assembly-Data-START## ; ' and
+    ' ; ##Assembly-Data-END##' is inspected; inside it every 'Label :: value'
+    entry is read up to the next ' ;'. A missing marker or label yields no
+    entry instead of an IndexError.
+    """
+    technology = {}
+    if not comment_text:
+        return technology
+
+    _, start_found, after_start = comment_text.partition('##Assembly-Data-START## ; ')
+    if not start_found:
+        return technology
+    assembly_text = after_start.split(' ; ##Assembly-Data-END##')[0]
+
+    for info_to_check, field_in_yaml in assembly_data_to_yaml_field:
+        _, label_found, after_label = assembly_text.partition('{} :: '.format(info_to_check))
+        if label_found:
+            technology[field_in_yaml] = after_label.split(' ;')[0]
+
+    return technology
+
+
+def _parse_host(host_text):
+    """Return the 'host' fields extracted from a 'host' qualifier value.
+
+    The value is split on '; '. The first field is the host name, the second
+    one is stored as sex ('male'/'female') or otherwise as health status, and
+    a field of the form 'age <integer>' in any later position gives the age.
+    """
+    host = {}
+    field_list = host_text.split('; ')
+
+    host['host_common_name'] = field_list[0]
+    if field_list[0] in species_to_taxid_dict:
+        host['host_species'] = species_to_taxid_dict[field_list[0]]
+
+    for position, field in enumerate(field_list[1:], start=1):
+        if field.startswith('age '):
+            # Look for the age field itself rather than testing for the
+            # substring 'age' and reading a fixed index, which failed on
+            # values with fewer than three fields.
+            try:
+                host['host_age'] = int(field[len('age '):])
+            except ValueError:
+                print('WARNING: age not parsed in host', repr(host_text))
+                continue
+            host['host_age_unit'] = 'year'
+        elif position == 1:
+            if field in ['male', 'female']:
+                host['host_sex'] = field
+            else:
+                host['host_health_status'] = field
+
+    return host
+
+
+# One <accession.version>.fasta and one <accession.version>.yaml is written in
+# the current directory for every record that carries a sequence.
+for GBSeq in GBSet:
+    accession_version = GBSeq.find('GBSeq_accession-version').text
+
+    GBSeq_sequence = GBSeq.find('GBSeq_sequence')
+    if GBSeq_sequence is None or not GBSeq_sequence.text:
+        print(accession_version, ' - sequence not found')
+        continue
+
+    # A general default-empty yaml could be read from the definitive one
+    info_for_yaml_dict = {
+        'id': 'placeholder',
+        'host': {},
+        'sample': {},
+        'virus': {},
+        'technology': {},
+        'submitter': {}
+    }
+
+    info_for_yaml_dict['sample']['sample_id'] = accession_version
+    # Empty GBAuthor elements have text None and would break the join.
+    info_for_yaml_dict['submitter']['authors'] = ';'.join(
+        [x.text for x in GBSeq.iter('GBAuthor') if x.text]
+    )
+
+    GBSeq_comment = GBSeq.find('GBSeq_comment')
+    if GBSeq_comment is not None:
+        info_for_yaml_dict['technology'].update(_parse_assembly_data(GBSeq_comment.text))
+
+    for GBFeature in GBSeq.iter('GBFeature'):
+        # Only the qualifiers of the 'source' feature are read.
+        if GBFeature.find('GBFeature_key').text != 'source':
+            continue
+
+        for GBQualifier in GBFeature.iter('GBQualifier'):
+            GBQualifier_value = GBQualifier.find('GBQualifier_value')
+            if GBQualifier_value is None or GBQualifier_value.text is None:
+                continue
+            GBQualifier_value_text = GBQualifier_value.text
+
+            GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text
+
+            if GBQualifier_name_text == 'host':
+                info_for_yaml_dict['host'].update(_parse_host(GBQualifier_value_text))
+            elif GBQualifier_name_text == 'collected_by':
+                # Keyword heuristic to tell an institution from a person.
+                if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]):
+                    info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text
+                else:
+                    info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text
+            elif GBQualifier_name_text == 'isolation_source':
+                info_for_yaml_dict['sample']['specimen_source'] = GBQualifier_value_text
+            elif GBQualifier_name_text == 'collection_date':
+                # TO_DO: which format we will use?
+                info_for_yaml_dict['sample']['collection_date'] = GBQualifier_value_text
+            elif GBQualifier_name_text in ['lat_lon', 'country']:
+                # NOTE(review): both qualifiers share this key, so the one
+                # that comes later in the record overwrites the other.
+                info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text
+            elif GBQualifier_name_text == 'note':
+                info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
+            elif GBQualifier_name_text == 'isolate':
+                info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
+            elif GBQualifier_name_text == 'db_xref':
+                # A db_xref without 'taxon:' is skipped instead of raising
+                # an IndexError.
+                if 'taxon:' in GBQualifier_value_text:
+                    info_for_yaml_dict['virus']['virus_species'] = int(GBQualifier_value_text.split('taxon:')[1])
+
+    with open('{}.fasta'.format(accession_version), 'w') as fw:
+        # Trailing newline so that concatenated fasta files do not glue the
+        # next header to the end of this sequence.
+        fw.write('>{}\n{}\n'.format(accession_version, GBSeq_sequence.text.upper()))
+
+    with open('{}.yaml'.format(accession_version), 'w') as fw:
+        yaml.dump(info_for_yaml_dict, fw, default_flow_style=False)
diff --git a/scripts/sequences.acc b/scripts/sequences.acc
new file mode 100644
index 0000000..62bde2c
--- /dev/null
+++ b/scripts/sequences.acc
@@ -0,0 +1,877 @@
+MT325599
+MT325601
+MT325602
+MT325607
+MT325608
+MT325609
+MT325610
+MT325612
+MT325616
+MT325617
+MT325618
+MT325622
+MT325623
+MT325600
+MT325606
+MT325611
+MT325613
+MT325615
+MT325619
+MT325620
+MT325624
+MT325625
+MT325565
+MT325566
+MT326147
+MT326153
+MT326173
+MT326174
+MT326176
+MT326186
+MT326023
+MT326026
+MT326027
+MT326034
+MT326039
+MT326042
+MT326045
+MT326047
+MT326051
+MT326054
+MT326060
+MT326061
+MT326064
+MT326065
+MT326068
+MT326072
+MT326076
+MT326078
+MT326079
+MT325590
+MT325640
+MT326130
+MT326129
+MT326128
+MT326121
+MT326119
+MT326109
+MT326100
+MT325568
+MT324679
+MT325561
+MT325571
+MT325585
+MT325587
+MT325588
+MT325589
+MT325596
+MT325597
+MT325603
+MT325614
+MT325621
+MT325629
+MT325630
+MT325638
+MT325639
+MT326086
+MT326087
+MT326102
+MT326104
+MT326105
+MT326123
+MT328033
+MT328034
+MT325562
+MT325564
+MT325567
+MT326154
+MT326155
+MT326156
+MT326157
+MT326163
+MT326165
+MT326175
+MT326177
+MT326184
+MT326185
+MT326187
+MT325572
+MT325575
+MT325583
+MT325584
+MT325604
+MT325631
+MT325632
+MT325635
+MT325636
+MT325637
+MT326095
+MT326096
+MT326103
+MT326112
+MT326113
+MT326114
+MT326115
+MT326122
+MT326131
+MT326132
+MT326133
+MT325563
+MT326164
+MT326166
+MT326167
+MT325569
+MT326097
+MT326106
+MT326107
+MT326116
+MT326117
+MT326124
+MT326125
+MT326126
+MT326127
+MT326134
+MT326135
+MT326136
+MT326137
+MT326138
+MT326139
+MT326140
+MT326141
+MT326142
+MT326143
+MT326144
+MT326145
+MT326146
+MT326148
+MT326149
+MT326150
+MT326151
+MT326152
+MT326158
+MT326159
+MT326160
+MT326161
+MT326162
+MT326168
+MT326169
+MT326170
+MT326171
+MT326172
+MT326178
+MT326179
+MT326180
+MT326181
+MT326182
+MT326183
+MT326188
+MT326189
+MT326190
+MT326191
+MT326120
+MT326118
+MT326111
+MT326110
+MT326108
+MT326101
+MT326099
+MT326098
+MT326094
+MT326093
+MT326092
+MT326091
+MT326090
+MT326085
+MT326084
+MT326083
+MT326082
+MT326081
+MT326080
+MT326077
+MT326067
+MT326057
+MT326024
+MT326025
+MT326032
+MT326033
+MT326035
+MT326036
+MT326037
+MT326040
+MT326041
+MT326043
+MT326044
+MT326046
+MT326049
+MT326050
+MT326052
+MT326053
+MT326055
+MT326056
+MT326059
+MT326062
+MT326063
+MT326066
+MT326069
+MT326070
+MT326071
+MT326073
+MT326074
+MT326075
+MT326088
+MT326089
+MT327745
+MT324062
+MT324680
+MT324684
+MT325573
+MT325574
+MT325576
+MT325577
+MT325578
+MT325580
+MT325591
+MT325592
+MT325593
+MT325595
+MT325605
+MT325627
+MT326028
+MT326029
+MT326031
+MT326048
+MT325570
+MT325579
+MT325581
+MT325582
+MT325586
+MT325594
+MT325598
+MT325626
+MT325628
+MT325633
+MT325634
+MT326030
+MT326038
+MT326058
+MT324681
+MT324682
+MT324683
+MT328032
+MT328035
+MT039874
+MT077125
+MT322394
+MT322397
+MT322398
+MT322399
+MT322400
+MT322401
+MT322403
+MT322404
+MT322405
+MT322406
+MT322408
+MT322409
+MT322410
+MT322411
+MT322412
+MT322413
+MT322414
+MT322415
+MT322416
+MT322417
+MT322418
+MT322419
+MT322420
+MT322421
+MT322422
+MT322423
+MT322424
+MT322396
+MT322402
+MT322395
+MT322407
+MT320538
+MT320891
+MT308692
+MT308693
+MT308698
+MT308699
+MT308703
+MT308704
+MT308694
+MT308695
+MT308696
+MT308697
+MT308700
+MT308701
+MT308702
+MT304476
+MT304474
+MT304475
+MT293547
+MT304477
+MT304483
+MT300186
+MT304478
+MT304479
+MT304480
+MT304481
+MT304482
+MT304484
+MT304485
+MT304486
+MT304487
+MT304488
+MT304489
+MT304490
+MT304491
+MT291831
+MT291836
+MT291834
+MT291835
+MT292570
+MT293173
+MT292574
+MT293179
+MT293181
+MT293183
+MT293195
+MT293196
+MT293201
+MT293204
+MT291829
+MT291830
+MT291827
+MT292572
+MT292577
+MT293186
+MT293187
+MT293188
+MT292580
+MT292581
+MT292571
+MT292576
+MT292578
+MT293185
+MT293160
+MT293161
+MT293199
+MT292579
+MT291828
+MT293166
+MT293167
+MT293168
+MT293175
+MT293190
+MT293191
+MT273658
+MT293159
+MT292582
+MT293162
+MT293163
+MT293164
+MT293165
+MT293156
+MT293157
+MT293158
+MT281577
+MT293171
+MT293174
+MT293176
+MT293182
+MT293210
+MT293211
+MT293217
+MT293218
+MT295465
+MT293213
+MT293221
+MT295464
+MT292569
+MT293169
+MT293172
+MT293177
+MT293200
+MT293198
+MT293205
+MT293207
+MT293212
+MT293216
+MT293219
+MT293222
+MT293224
+MT293225
+MT293206
+MT293208
+MT293209
+MT293214
+MT293215
+MT293220
+MT293170
+MT292573
+MT293178
+MT292575
+MT293180
+MT293184
+MT293189
+MT293192
+MT293193
+MT293194
+MT293197
+MT293202
+MT293203
+MT293223
+MT291826
+MT291832
+MT291833
+MT281530
+MT276331
+MT276325
+MT276324
+MT276326
+MT276327
+MT276330
+MT276329
+MT276597
+MT276598
+MT276323
+MT276328
+MT262896
+MT263385
+MT263392
+MT262900
+MT262901
+MT262902
+MT262909
+MT262911
+MT262912
+MT263382
+MT263383
+MT263384
+MT263423
+MT263431
+MT263421
+MT263443
+MT263461
+MT263420
+MT263429
+MT263434
+MT263435
+MT263437
+MT263445
+MT263428
+MT263433
+MT263436
+MT263438
+MT263440
+MT263444
+MT263452
+MT263455
+MT263456
+MT263462
+MT263463
+MT263466
+MT263446
+MT263447
+MT263448
+MT263449
+MT263451
+MT263453
+MT263388
+MT263391
+MT262903
+MT262906
+MT262907
+MT262908
+MT262913
+MT262914
+MT263390
+MT263398
+MT263403
+MT263430
+MT263399
+MT263404
+MT263405
+MT263414
+MT263389
+MT263393
+MT263394
+MT263395
+MT263396
+MT263397
+MT263402
+MT263469
+MT263441
+MT263454
+MT263467
+MT263465
+MT263468
+MT263439
+MT263457
+MT263460
+MT263450
+MT263458
+MT263459
+MT263464
+MT263386
+MT263387
+MT262897
+MT262899
+MT262904
+MT262905
+MT262910
+MT263408
+MT263412
+MT263416
+MT263417
+MT263422
+MT263432
+MT263419
+MT263424
+MT263427
+MT263442
+MT263413
+MT263418
+MT263425
+MT263401
+MT263409
+MT263410
+MT263411
+MT263426
+MT263406
+MT263407
+MT263415
+MT262993
+MT263074
+MT263381
+MT262898
+MT262915
+MT262916
+MT263400
+MT259257
+MT259261
+MT259262
+MT259263
+MT259264
+MT259268
+MT259269
+MT259270
+MT259271
+MT259272
+MT259273
+MT259275
+MT259276
+MT259277
+MT259279
+MT259244
+MT259245
+MT258381
+MT258377
+MT258379
+MT259226
+MT259281
+MT259282
+MT259283
+MT258378
+MT259231
+MT259274
+MT259286
+MT256917
+MT259227
+MT259238
+MT258382
+MT259246
+MT259253
+MT259254
+MT259255
+MT259259
+MT259265
+MT259284
+MT259252
+MT259229
+MT259230
+MT259260
+MT259285
+MT259278
+MT259280
+MT259247
+MT259240
+MT259243
+MT259249
+MT259250
+MT259251
+MT259256
+MT259258
+MT259266
+MT259267
+MT259287
+MT259241
+MT259242
+MT259228
+MT259236
+MT258383
+MT259248
+MT256918
+MT258380
+MT259235
+MT259237
+MT259239
+MT256924
+LC534419
+LC534418
+MT253704
+MT253710
+MT253701
+MT253702
+MT253703
+MT253706
+MT253707
+MT251972
+MT251973
+MT251975
+MT251976
+MT251978
+MT251979
+MT251977
+MT251980
+MT253696
+MT253697
+MT253698
+MT253699
+MT251974
+MT253700
+MT253705
+MT253709
+MT253708
+MT233526
+MT246667
+MT246451
+MT246453
+MT246454
+MT246461
+MT246462
+MT246490
+MT246450
+MT246452
+MT246464
+MT246470
+MT246474
+MT246480
+MT246481
+MT246482
+MT246457
+MT246459
+MT246466
+MT246489
+MT246456
+MT246458
+MT246475
+MT246476
+MT246477
+MT246479
+MT246487
+MT246449
+MT246455
+MT246468
+MT246469
+MT246486
+MT246488
+MT246467
+MT246478
+MT246485
+MT246460
+MT246463
+MT246465
+MT246471
+MT246472
+MT246473
+MT246483
+MT246484
+MT240479
+MT232869
+MT232870
+MT233522
+MT232871
+MT232872
+MT233520
+MT233523
+MT233519
+MT233521
+MT226610
+MT198653
+MT198651
+MT198652
+MT192759
+MT192765
+MT192772
+MT192773
+MT192758
+MT188341
+MT188339
+MT188340
+MT186680
+MT186676
+MT186677
+MT186679
+MT186678
+MT187977
+MT186681
+MT186682
+MT184912
+MT184910
+MT184911
+MT184913
+MT184909
+MT184907
+MT184908
+CADDYA000000000
+MT163712
+MT163716
+MT163719
+MT163720
+MT163715
+MT163721
+MT163714
+MT163717
+MT163737
+MT163738
+MT163718
+MT159706
+MT159707
+MT159717
+MT159716
+MT159719
+MT159709
+MT159710
+MT159712
+MT159713
+MT159714
+MT159722
+MT159711
+MT159715
+MT159718
+MT159720
+MT159721
+MT159708
+MT121215
+MT159778
+MT159705
+MT161607
+MT066156
+LC529905
+MT050493
+MT012098
+MT152824
+MT152900
+MT135043
+MT135042
+MT135041
+MT135044
+MT127116
+MT127113
+MT127114
+MT127115
+MT126808
+LC528233
+LC528232
+MT123290
+MT123291
+MT123292
+MT123293
+MT118835
+MT111896
+MT111895
+MT106052
+MT106053
+MT106054
+MT093571
+MT093631
+MT081059
+MT081068
+MT081060
+MT081061
+MT081065
+MT081067
+MT081062
+MT081063
+MT081064
+MT081066
+MT072667
+MT072688
+MT072668
+MT066158
+MT066157
+MT066159
+MT066175
+MT066176
+LC523807
+LC523808
+LC523809
+MT044258
+MT044257
+MT042777
+MT042778
+MT042776
+MT049951
+MT050414
+MT050415
+MT050417
+MT050416
+MT042774
+MT042775
+MT042773
+MT039887
+MT039888
+MT039890
+MT039873
+LC522350
+MT027062
+MT027063
+MT027064
+MT020781
+MT019530
+MT019531
+MT020881
+MT019533
+MT019529
+MT019532
+MT020880
+LR757995
+LR757996
+LR757997
+LR757998
+MT007544
+MT008023
+MT008022
+MN996530
+MN996531
+MN996527
+MN996528
+MN996529
+MN997409
+MN994468
+MN988668
+MN988669
+MN994467
+MN988713
+MN938387
+MN938389
+MN975263
+MN975268
+MN975267
+MN938388
+MN938390
+MN975264
+MN975265
+MN975266
+MN938386
+MN938385
+MN938384
+MN975262
+MN985325
+MN970003
+MN970004
+NC_045512
+MN908947