From 8cb542fdf60273aec7ec107f8bc4896375381263 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 13:55:49 +0200 Subject: an output file is created with the accessions for which no YAML file is created --- .../download_genbank_data/from_genbank_to_fasta_and_yaml.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'scripts/download_genbank_data') diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 39e401a..d5b0ffd 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -138,6 +138,7 @@ min_len_to_count = 27500 num_seq_with_len_ge_X_bp = 0 missing_value_list = [] +not_created_accession_list = [] accession_with_errors_list = [] for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: @@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: - print(accession_version, ' - technology not found') + #print(accession_version, ' - technology not found') + not_created_accession_list.append([accession_version, 'technology not found']) continue with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: @@ -400,4 +402,10 @@ if len(accession_with_errors_list) > 0: with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) +if len(not_created_accession_list) > 0: + path_not_created_accession_tsv = 'not_created_accession.tsv' + print('Written not created accession in {}'.format(path_not_created_accession_tsv)) + with open(path_not_created_accession_tsv, 'w') as fw: + fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) + print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) -- cgit v1.2.3 From bb90f06da570624952d4b7001ee37fc7018e3a7d Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 12 Jul 2020 15:58:29 +0200 Subject: added a suffix to distinguish which script created the error/warning files --- scripts/create_sra_metadata/create_sra_metadata.py | 4 ++-- scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'scripts/download_genbank_data') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index a31bd36..352a30e 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -251,13 +251,13 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): json.dump(info_for_yaml_dict, fw, indent=2) if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' + path_missing_terms_tsv = 'missing_terms.sra.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) if len(not_created_accession_list) > 0: - path_not_created_accession_tsv = 'not_created_accession.tsv' + path_not_created_accession_tsv = 'not_created_accession.sra.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index d5b0ffd..dbebfbb 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -391,19 +391,19 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) continue if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' + path_missing_terms_tsv = 'missing_terms.genbank.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) if len(accession_with_errors_list) > 0: - path_accession_with_errors_tsv = 'accession_with_errors.tsv' + path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv' print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv)) with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) if len(not_created_accession_list) > 0: - path_not_created_accession_tsv = 'not_created_accession.tsv' + path_not_created_accession_tsv = 'not_created_accession.genbank.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) -- cgit v1.2.3