about summary refs log tree commit diff
path: root/scripts/download_genbank_data
diff options
context:
space:
mode:
author Pjotr Prins 2020-07-17 11:08:15 +0100
committer Pjotr Prins 2020-07-17 11:08:15 +0100
commit 16bb5df907c79cd0ce6bea0015821a2ce51fb992 (patch)
tree ddb9677cddcc463bb514300189cbd4300b9117ed /scripts/download_genbank_data
parent 0be9983ef88fd3b925d8fa53e7f9ab2a28703bc0 (diff)
parent c69046ee9a5e24eadcd8cb885633328b0fd88011 (diff)
download bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.gz
bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.tar.lz
bh20-seq-resource-16bb5df907c79cd0ce6bea0015821a2ce51fb992.zip
Merge branch 'master' into ebi-submit
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py 14
1 file changed, 11 insertions, 3 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 39e401a..dbebfbb 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -138,6 +138,7 @@ min_len_to_count = 27500
 num_seq_with_len_ge_X_bp = 0
 
 missing_value_list = []
+not_created_accession_list = []
 accession_with_errors_list = []
 
 for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
 
 
             if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
-                print(accession_version, ' - technology not found')
+                #print(accession_version, ' - technology not found')
+                not_created_accession_list.append([accession_version, 'technology not found'])
                 continue
 
             with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -389,15 +391,21 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
             continue
 
 if len(missing_value_list) > 0:
-    path_missing_terms_tsv = 'missing_terms.tsv'
+    path_missing_terms_tsv = 'missing_terms.genbank.tsv'
     print('Written missing terms in {}'.format(path_missing_terms_tsv))
     with open(path_missing_terms_tsv, 'w') as fw:
         fw.write('\n'.join(missing_value_list))
 
 if len(accession_with_errors_list) > 0:
-    path_accession_with_errors_tsv = 'accession_with_errors.tsv'
+    path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv'
     print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
     with open(path_accession_with_errors_tsv, 'w') as fw:
         fw.write('\n'.join(accession_with_errors_list))
 
+if len(not_created_accession_list) > 0:
+    path_not_created_accession_tsv = 'not_created_accession.genbank.tsv'
+    print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+    with open(path_not_created_accession_tsv, 'w') as fw:
+        fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
 print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))