From 4299c750728bbad4bdbf0311ff2a4b9c65d9883c Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Thu, 27 Aug 2020 00:18:24 +0200 Subject: updated dependency from clustalw to minimap2; the genbank script no longer creates YAML/FASTA pairs for too short sequences --- .../from_genbank_to_fasta_and_yaml.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'scripts') diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 272b5ba..8ef76e1 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -145,7 +145,7 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x if not os.path.exists(dir_fasta_and_yaml): os.makedirs(dir_fasta_and_yaml) -min_len_to_count = 27500 +min_len_to_count = 15000 num_seq_with_len_ge_X_bp = 0 missing_value_list = [] @@ -411,18 +411,17 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) not_created_accession_dict[accession_version] = [] not_created_accession_dict[accession_version].append('host_species not found') - if accession_version in not_created_accession_dict: - continue - - with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: - fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) + if len(GBSeq_sequence.text) < min_len_to_count: + not_created_accession_dict[accession_version].append('sequence shorter than {} bp'.format(min_len_to_count)) - with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: - json.dump(info_for_yaml_dict, fw, indent=2) + if accession_version not in not_created_accession_dict: + num_seq_with_len_ge_X_bp += 1 + with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: + fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper())) - if(len(GBSeq_sequence.text) >= min_len_to_count): - num_seq_with_len_ge_X_bp += 1 + with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw: + json.dump(info_for_yaml_dict, fw, indent=2) except: print("Unexpected error for the ID {}: {}".format(accession_version, sys.exc_info()[0])) accession_with_errors_list.append(accession_version) -- cgit v1.2.3