aboutsummaryrefslogtreecommitdiff
path: root/scripts/create_sra_metadata/create_sra_metadata.py
blob: ef0d1191b347ecda5acca416c63b54e30948d435 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#!/usr/bin/env python3

import os
from dateutil.parser import parse
import xml.etree.ElementTree as ET
import json
import gzip

dir_yaml = 'yaml'

date = '2020.07.05'

# Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D)
# Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases)
#         -> Send to -> File -> Full XML -> Create File
path_sra_metadata_xml = 'SraExperimentPackage.{}.xml.gz'.format(date)

dir_dict_ontology_standardization = '../dict_ontology_standardization/'
path_sra_study_accessions_txt = 'SRAStudyAccessions.{}.txt'.format(date)

term_to_uri_dict = {}

for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
    print('Read {}'.format(path_dict_xxx_csv))

    with open(path_dict_xxx_csv, 'r') as f:
        for line in f:
            if len(line.split(',')) > 2:
                term, uri = line.strip('\n').split('",')
                term = term.strip('"')
            else:
                term, uri = line.strip('\n').split(',')

            term_to_uri_dict[term] = uri

def is_integer(string_to_check):
    try:
        int(string_to_check)
        return True
    except ValueError:
        return False

if not os.path.exists(dir_yaml):
    os.makedirs(dir_yaml)

sra_metadata_xml_file = gzip.open(path_sra_metadata_xml, 'r')
tree = ET.parse(sra_metadata_xml_file)
sra_metadata_xml_file.close()

EXPERIMENT_PACKAGE_SET = tree.getroot()

missing_value_list = []

run_accession_set = set()
run_accession_to_downloadble_file_url_dict = {}

for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
    #print(i, EXPERIMENT_PACKAGE)
    
    # A general default-empty yaml could be read from the definitive one
    info_for_yaml_dict = {
        'id': 'placeholder',
        'host': {},
        'sample': {},
        'virus': {},
        'technology': {},
        'submitter': {}
    }

    RUN_SET = EXPERIMENT_PACKAGE.find('RUN_SET')
    RUN = RUN_SET.find('RUN')
    accession = RUN.attrib['accession']
    run_accession_set.add(accession)
    #print(accession)

    info_for_yaml_dict['sample']['sample_id'] = accession
    
    #SRAFiles = RUN.find('SRAFiles')
    #if SRAFiles is not None:
    #    url = SRAFiles.find('SRAFile').attrib['url']
    #    if 'sra-download.ncbi.nlm.nih.gov' in url:
    #        run_accession_to_downloadble_file_url_dict[accession] = url
  

    SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE')
    SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE')
    
    for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list:
        VALUE = SAMPLE_ATTRIBUTE.find('VALUE')
        if VALUE is not None:
            TAG_text = SAMPLE_ATTRIBUTE.find('TAG').text
            VALUE_text = VALUE.text

            if TAG_text in ['host', 'host scientific name']:
                if VALUE_text.lower() in ['homo sapien', 'homosapiens']:
                    VALUE_text = 'Homo sapiens'

                if VALUE_text in term_to_uri_dict:
                    info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[VALUE_text]
                else:
                    missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
            elif TAG_text in ['host_health_status', 'host health state']:
                if VALUE_text in term_to_uri_dict:
                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]                            
                elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
                    missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
            elif TAG_text in ['strain', 'isolate']:
                if VALUE_text.lower() not in ['not applicable', 'missing', 'na', 'unknown', 'not provided']:
                    value_to_insert = VALUE_text

                    if value_to_insert.lower() in ['homo sapien', 'homosapiens']:
                        value_to_insert = 'Homo sapiens'

                    if value_to_insert in term_to_uri_dict:
                        value_to_insert = term_to_uri_dict[value_to_insert]
                        
                    if 'virus_strain' not in info_for_yaml_dict:                        
                        info_for_yaml_dict['virus']['virus_strain'] = value_to_insert
                    else:
                        info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert
            elif TAG_text in ['isolation_source', 'isolation source host-associated']:                    
                if VALUE_text in term_to_uri_dict:
                    info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
                else:
                    if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
                    elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
                    elif VALUE_text.lower() in ['nasal swab and throat swab']:
                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
                    elif VALUE_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
                    elif VALUE_text.strip("'") not in ['missing', 'not collected', 'unknown', 'not provided', 'not applicable', 'N/A']:
                        missing_value_list.append('\t'.join([accession, 'specimen_source', VALUE_text]))
            elif TAG_text in ['host_sex', 'host sex']:
                if VALUE_text.lower() not in ['missing', 'not provided']:
                    if VALUE_text in ['male', 'female']:
                        info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if VALUE_text == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
                    else:
                        missing_value_list.append('\t'.join([accession, 'host_sex', VALUE_text]))
            elif TAG_text in ['host_age', 'host age']:
                if is_integer(VALUE_text):
                    info_for_yaml_dict['host']['host_age'] = VALUE_text
                    info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
            elif TAG_text == 'collected_by':
                if VALUE_text.lower() not in ['not available', 'missing']:
                    name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli']
                    
                    info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text
            elif TAG_text == 'collecting institution':
                if VALUE_text.lower() not in ['not provided', 'na']:
                    info_for_yaml_dict['sample']['collecting_institution'] = VALUE_text
            elif TAG_text in ['collection_date', 'collection date']:
                if VALUE_text.lower() not in ['not applicable', 'missing', 'na']:
                    date_to_write = VALUE_text
                    date_is_estimated = True
                    
                    VALUE_text_list = VALUE_text.split('-')
                    if len(VALUE_text_list) == 3:
                        date_is_estimated = False
                        
                        if VALUE_text_list[1].isalpha():
                            date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
                    elif len(VALUE_text_list) == 2:
                        date_to_write =  VALUE_text + '-15'
                    else:
                        if int(VALUE_text) < 2020:
                            date_to_write = "{}-12-15".format(VALUE_text)
                        else:
                            date_to_write = "{}-01-15".format(VALUE_text)

                    info_for_yaml_dict['sample']['collection_date'] = date_to_write
                    
                    if date_is_estimated:
                        if 'additional_collection_information' in info_for_yaml_dict['sample']:
                            info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
                        else:
                            info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
            elif TAG_text == 'geo_loc_name':
                if VALUE_text in term_to_uri_dict:
                    info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text]
                elif VALUE_text.lower() not in ['na', 'not applicable']:
                    missing_value_list.append('\t'.join([accession, 'geo_loc_name', VALUE_text]))
            #else:
            #    if TAG_text not in ['lat_lon', 'host_disease', 'BioSampleModel', 'passage_history']:
            #        print(accession, TAG_text, VALUE_text)


    taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
    info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id
    
    
    EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
    INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]

    if INSTRUMENT_MODEL.lower() != 'unspecified':
        if INSTRUMENT_MODEL in term_to_uri_dict:
            info_for_yaml_dict['technology']['sample_sequencing_technology'] = [term_to_uri_dict[INSTRUMENT_MODEL]]
        else:
            missing_value_list.append('\t'.join([accession, 'sample_sequencing_technology', INSTRUMENT_MODEL]))
    #else:
    #    print(accession, 'Missing INSTRUMENT_MODEL', info_for_yaml_dict)
    LIBRARY_DESCRIPTOR = EXPERIMENT.find('DESIGN').find('LIBRARY_DESCRIPTOR')
    if LIBRARY_DESCRIPTOR.text not in ['OTHER']:
        info_for_yaml_dict['technology']['additional_technology_information'] = 'LIBRARY_STRATEGY: {};'.format(LIBRARY_DESCRIPTOR.find('LIBRARY_STRATEGY').text)

    SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION')
    info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession']
    
    if SUBMISSION.attrib['lab_name'].lower() not in ['na']:
        info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name']

    STUDY = EXPERIMENT_PACKAGE.find('STUDY')     
    info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias']
    
    
    Organization = EXPERIMENT_PACKAGE.find('Organization')
    Organization_Name = Organization.find('Name')
    info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text]
        
    Organization_Contact = Organization.find('Contact')
    if Organization_Contact is not None:
        Organization_Contact_Name = Organization_Contact.find('Name')
        info_for_yaml_dict['submitter']['submitter_name'] = [Organization_Contact_Name.find('First').text + ' ' + Organization_Contact_Name.find('Last').text]
        info_for_yaml_dict['submitter']['additional_submitter_information'] = Organization_Contact.attrib['email']

        Organization_Concact_Address = Organization_Contact.find('Address')
        if Organization_Concact_Address is not None:
            info_for_yaml_dict['submitter']['submitter_address'] = '; '.join([x.text for x in Organization_Concact_Address] + ['Postal code ' + Organization_Concact_Address.attrib['postal_code']])

    Organization_Address = Organization.find('Address')
    if Organization_Address is not None:
        info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']])
    
    if 'collection_date' not in info_for_yaml_dict['sample']:
        info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
        info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"

    if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
        print(accession, ' - technology not found')
        continue

    with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
        json.dump(info_for_yaml_dict, fw, indent=2)
    
if len(missing_value_list) > 0:
    path_missing_terms_tsv = 'missing_terms.tsv'
    print('Written missing terms in {}'.format(path_missing_terms_tsv))
    with open(path_missing_terms_tsv, 'w') as fw:
        fw.write('\n'.join(missing_value_list))