aboutsummaryrefslogtreecommitdiff
path: root/scripts/download_genbank_data
diff options
context:
space:
mode:
author: Andrea Guarracino, 2020-07-10 13:55:49 +0200
committer: Andrea Guarracino, 2020-07-10 13:55:49 +0200
commit: 8cb542fdf60273aec7ec107f8bc4896375381263 (patch)
tree: d7cae4757ac4ac8ef10eda38724d1ffea14b60e1 /scripts/download_genbank_data
parent: 1655762b516804dad3d71538e95d97d74653c3e9 (diff)
download: bh20-seq-resource-8cb542fdf60273aec7ec107f8bc4896375381263.tar.gz
bh20-seq-resource-8cb542fdf60273aec7ec107f8bc4896375381263.tar.lz
bh20-seq-resource-8cb542fdf60273aec7ec107f8bc4896375381263.zip
an output file is created with the accessions for which no YAML file is created
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py 10
1 files changed, 9 insertions, 1 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 39e401a..d5b0ffd 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -138,6 +138,7 @@ min_len_to_count = 27500
num_seq_with_len_ge_X_bp = 0
missing_value_list = []
+not_created_accession_list = []
accession_with_errors_list = []
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession_version, ' - technology not found')
+ #print(accession_version, ' - technology not found')
+ not_created_accession_list.append([accession_version, 'technology not found'])
continue
with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -400,4 +402,10 @@ if len(accession_with_errors_list) > 0:
with open(path_accession_with_errors_tsv, 'w') as fw:
fw.write('\n'.join(accession_with_errors_list))
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.tsv'
+ print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))