aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: LLTommy, 2020-04-23 23:04:40 +0200
committer: GitHub, 2020-04-23 23:04:40 +0200
commit: 96684a1f6bbbbc52c93d96997223f059e3a77d4a (patch)
tree: f7987ba525b2d1a8dd2f1f00f2dc12f5e4535c6a
parent: a1a0e99acaccc95fafeffed489642af28d744d76 (diff)
parent: 5c2dfddc5c9cedd1fe208ee564cfa916975626e3 (diff)
download: bh20-seq-resource-96684a1f6bbbbc52c93d96997223f059e3a77d4a.tar.gz
bh20-seq-resource-96684a1f6bbbbc52c93d96997223f059e3a77d4a.tar.lz
bh20-seq-resource-96684a1f6bbbbc52c93d96997223f059e3a77d4a.zip
Merge pull request #34 from AndreaGuarracino/patch-12
new ID, new script release, new dict term
-rw-r--r-- scripts/dict_ontology_standardization/ncbi_speciesman_source.csv 1
-rwxr-xr-x scripts/from_genbank_to_fasta_and_yaml.py 32
-rw-r--r-- scripts/sequences.acc 227
3 files changed, 248 insertions, 12 deletions
diff --git a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
index 8278c90..f5aeaae 100644
--- a/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
+++ b/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv
@@ -24,3 +24,4 @@ aspirate,http://purl.obolibrary.org/obo/NCIT_C13347
stool,http://purl.obolibrary.org/obo/NCIT_C13234
serum,http://purl.obolibrary.org/obo/NCIT_C13325
saliva,http://purl.obolibrary.org/obo/NCIT_C13275
+nasal swab,http://purl.obolibrary.org/obo/NCIT_C132119
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index f76cb29..5257bd1 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -110,9 +110,26 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
'submitter': {}
}
+
info_for_yaml_dict['sample']['sample_id'] = accession_version
info_for_yaml_dict['sample']['source_database_accession'] = accession_version
- info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq.iter('GBAuthor')])
+
+
+ # submitter info
+ GBSeq_references = GBSeq.find('GBSeq_references')
+ if GBSeq_references is not None:
+ info_for_yaml_dict['submitter']['authors'] = ';'.join([x.text for x in GBSeq_references.iter('GBAuthor')])
+
+ GBReference = GBSeq_references.find('GBReference')
+ if GBReference is not None:
+ GBReference_journal = GBReference.find('GBReference_journal')
+
+ if GBReference_journal is not None and GBReference_journal.text != 'Unpublished':
+ if 'Submitted' in GBReference_journal.text:
+ info_for_yaml_dict['submitter']['submitter_name'] = GBReference_journal.text.split(') ')[1].split(',')[0].strip()
+ info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip()
+ else:
+ info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text
GBSeq_comment = GBSeq.find('GBSeq_comment')
@@ -130,7 +147,7 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
# A regular expression would be better!
try:
info_for_yaml_dict['technology'][field_in_yaml] = float(
- tech_info_to_parse.strip('(average)').strip("reads/nt").replace(',', '.').strip(' xX>'))
+ tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
except ValueError:
print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
pass
@@ -141,7 +158,6 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if seq_tec in term_to_uri_dict:
seq_tec = term_to_uri_dict[seq_tec]
else:
- #print(accession_version, 'missing sample_sequencing_technology:', seq_tec)
missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec]))
new_seq_tec_list.append(seq_tec)
@@ -152,8 +168,6 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
- #term_to_uri_dict
-
for GBFeature in GBSeq.iter('GBFeature'):
if GBFeature.find('GBFeature_key').text != 'source':
continue
@@ -169,8 +183,6 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if GBQualifier_name_text == 'host':
GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
- #info_for_yaml_dict['host']['host_common_name'] = GBQualifier_value_text_list[0] # Removed
-
if GBQualifier_value_text_list[0] in species_to_taxid_dict:
info_for_yaml_dict['host']['host_species'] = species_to_taxid_dict[GBQualifier_value_text_list[0]]
@@ -183,7 +195,6 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
elif GBQualifier_value_text_list[1] in term_to_uri_dict:
info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]
else:
- #print(accession_version, 'missing {}:'.format(GBQualifier_name_text), GBQualifier_value_text_list[1])
missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text_list[1]]))
if 'age' in GBQualifier_value_text:
@@ -211,7 +222,6 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
info_for_yaml_dict['sample']['specimen_source'] = term_to_uri_dict['nasopharyngeal aspirate']
info_for_yaml_dict['sample']['specimen_source2'] = term_to_uri_dict['throat swab']
else:
- #print(accession_version, 'missing specimen_source:', GBQualifier_value_text)
missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
elif GBQualifier_name_text == 'collection_date':
# TO_DO: which format we will use?
@@ -219,12 +229,10 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
elif GBQualifier_name_text in ['lat_lon', 'country']:
if GBQualifier_value_text == 'Hong Kong':
GBQualifier_value_text = 'China: Hong Kong'
-
-
+
if GBQualifier_value_text in term_to_uri_dict:
GBQualifier_value_text = term_to_uri_dict[GBQualifier_value_text]
else:
- #print(accession_version, 'missing {}:'.format(GBQualifier_name_text), GBQualifier_value_text)
missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text]))
info_for_yaml_dict['sample']['collection_location'] = GBQualifier_value_text
diff --git a/scripts/sequences.acc b/scripts/sequences.acc
index 0ad0878..a99c4e6 100644
--- a/scripts/sequences.acc
+++ b/scripts/sequences.acc
@@ -1,4 +1,231 @@
NC_045512
+MT370516
+MT370517
+MT370518
+MT370831
+MT370832
+MT370833
+MT370834
+MT370835
+MT370836
+MT370837
+MT370838
+MT370839
+MT370840
+MT370841
+MT370842
+MT370843
+MT370844
+MT370845
+MT370846
+MT370847
+MT370848
+MT370849
+MT370850
+MT370851
+MT370852
+MT370853
+MT370854
+MT370855
+MT370856
+MT370857
+MT370858
+MT370859
+MT370860
+MT370861
+MT370862
+MT370863
+MT370864
+MT370865
+MT370866
+MT370867
+MT370868
+MT370869
+MT370870
+MT370871
+MT370872
+MT370873
+MT370874
+MT370875
+MT370876
+MT370877
+MT370878
+MT370879
+MT370880
+MT370881
+MT370882
+MT370883
+MT370884
+MT370885
+MT370886
+MT370887
+MT370888
+MT370889
+MT370890
+MT370891
+MT370892
+MT370893
+MT370894
+MT370895
+MT370896
+MT370897
+MT370898
+MT370899
+MT370900
+MT370901
+MT370902
+MT370903
+MT370904
+MT370905
+MT370906
+MT370907
+MT370908
+MT370909
+MT370910
+MT370911
+MT370912
+MT370913
+MT370914
+MT370915
+MT370916
+MT370917
+MT370918
+MT370919
+MT370920
+MT370921
+MT370922
+MT370923
+MT370924
+MT370925
+MT370926
+MT370927
+MT370928
+MT370929
+MT370930
+MT370931
+MT370932
+MT370933
+MT370934
+MT370935
+MT370936
+MT370937
+MT370938
+MT370939
+MT370940
+MT370941
+MT370942
+MT370943
+MT370944
+MT370945
+MT370946
+MT370947
+MT370948
+MT370949
+MT370950
+MT370951
+MT370952
+MT370953
+MT370954
+MT370955
+MT370956
+MT370957
+MT370958
+MT370959
+MT370960
+MT370961
+MT370962
+MT370963
+MT370964
+MT370965
+MT370966
+MT370967
+MT370968
+MT370969
+MT370970
+MT370971
+MT370972
+MT370973
+MT370974
+MT370975
+MT370976
+MT370977
+MT370978
+MT370979
+MT370980
+MT370981
+MT370982
+MT370983
+MT370984
+MT370985
+MT370986
+MT370987
+MT370988
+MT370989
+MT370990
+MT370991
+MT370992
+MT370993
+MT370994
+MT370995
+MT370996
+MT370997
+MT370998
+MT370999
+MT371000
+MT371001
+MT371002
+MT371003
+MT371004
+MT371005
+MT371006
+MT371007
+MT371008
+MT371009
+MT371010
+MT371011
+MT371012
+MT371013
+MT371014
+MT371015
+MT371016
+MT371017
+MT371018
+MT371019
+MT371020
+MT371021
+MT371022
+MT371023
+MT371024
+MT371025
+MT371026
+MT371027
+MT371028
+MT371029
+MT371030
+MT371031
+MT371032
+MT371033
+MT371034
+MT371035
+MT371036
+MT371037
+MT371038
+MT371047
+MT371048
+MT371049
+MT371050
+MT371568
+MT371569
+MT371570
+MT371571
+MT371572
+MT371573
+MT371574
+MT372480
+MT372481
+MT372482
+MT372483
+LC542976
LC542809
MT114412
MT114413