From 1430c62ff9245bfecb1d41cc87bbafafcfc81ca3 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 4 Sep 2020 10:37:35 +0200 Subject: sra script updated for managing more locations --- scripts/create_sra_metadata/create_sra_metadata.py | 8 ++++++-- scripts/dict_ontology_standardization/ncbi_countries.csv | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index 2a05d26..09cc51b 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -85,6 +85,8 @@ not_created_accession_dict = {} run_accession_set = set() run_accession_to_downloadble_file_url_dict = {} +num_yaml_created = 0 + for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): #print(i, EXPERIMENT_PACKAGE) @@ -209,7 +211,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) else: info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) - elif TAG_text == 'geo_loc_name': + elif TAG_text in ['geo_loc_name', 'geographic location (country and/or sea)', 'geographic location (region and locality)']: if ': ' in VALUE_text: VALUE_text = VALUE_text.replace(': ', ':') @@ -301,6 +303,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): not_created_accession_dict[accession].append('host_species not found') if accession not in not_created_accession_dict: + num_yaml_created += 1 + with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) @@ -316,4 +320,4 @@ if len(not_created_accession_dict) > 0: with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join([accession_version, ','.join(missing_info_list)]) for accession_version, missing_info_list in not_created_accession_dict.items()])) - +print('Num. YAML files created: {}'.format(num_yaml_created)) diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv index 400d732..58a64e3 100644 --- a/scripts/dict_ontology_standardization/ncbi_countries.csv +++ b/scripts/dict_ontology_standardization/ncbi_countries.csv @@ -708,6 +708,7 @@ USA:WY,http://www.wikidata.org/entity/Q1214 Uzbekistan,http://www.wikidata.org/entity/Q265 Vanuatu,http://www.wikidata.org/entity/Q686 Vatican City,http://www.wikidata.org/entity/Q237 +Wales,http://www.wikidata.org/entity/Q25 Venezuela,http://www.wikidata.org/entity/Q717 Viet nam,http://www.wikidata.org/entity/Q881 Viet Nam,http://www.wikidata.org/entity/Q881 -- cgit v1.2.3