aboutsummaryrefslogtreecommitdiff
path: root/scripts/download_genbank_data
diff options
context:
space:
mode:
author AndreaGuarracino 2020-08-25 00:02:32 +0200
committer AndreaGuarracino 2020-08-25 00:02:32 +0200
commit 2baa88b766ec540bd34b96599014dd16e393af39 (patch)
tree 96ecbcedc36a427f03281cf27389e5ee92af8648 /scripts/download_genbank_data
parent 5c44403e4516b6c809ecde1398b306bbde2c6727 (diff)
download bh20-seq-resource-2baa88b766ec540bd34b96599014dd16e393af39.tar.gz
bh20-seq-resource-2baa88b766ec540bd34b96599014dd16e393af39.tar.lz
bh20-seq-resource-2baa88b766ec540bd34b96599014dd16e393af39.zip
the YAML/FASTA pair is not created for samples where at least one mandatory field is missing
Diffstat (limited to 'scripts/download_genbank_data')
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py 34
1 files changed, 29 insertions, 5 deletions
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index f314a1d..eefdddb 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -139,7 +139,7 @@ min_len_to_count = 27500
num_seq_with_len_ge_X_bp = 0
missing_value_list = []
-not_created_accession_list = []
+not_created_accession_dict = {}
accession_with_errors_list = []
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -374,10 +374,34 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
elif GBQualifier_name_text == 'db_xref':
info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
-
+ # Check if mandatory fields are missing
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
#print(accession_version, ' - technology not found')
- not_created_accession_list.append([accession_version, 'technology not found'])
+ if accession_version not in not_created_accession_dict:
+ not_created_accession_dict[accession_version] = []
+ not_created_accession_dict[accession_version].append('sample_sequencing_technology not found')
+
+ if 'collection_location' not in info_for_yaml_dict['sample']:
+ if accession_version not in not_created_accession_dict:
+ not_created_accession_dict[accession_version] = []
+ not_created_accession_dict[accession_version].append('collection_location not found')
+
+ if 'collection_date' not in info_for_yaml_dict['sample']:
+ if accession_version not in not_created_accession_dict:
+ not_created_accession_dict[accession_version] = []
+ not_created_accession_dict[accession_version].append('collection_date not found')
+
+ if 'authors' not in info_for_yaml_dict['submitter']:
+ if accession_version not in not_created_accession_dict:
+ not_created_accession_dict[accession_version] = []
+ not_created_accession_dict[accession_version].append('authors not found')
+
+ if 'host_species' not in info_for_yaml_dict['host']:
+ if accession_version not in not_created_accession_dict:
+ not_created_accession_dict[accession_version] = []
+ not_created_accession_dict[accession_version].append('host_species not found')
+
+ if accession_version in not_created_accession_dict:
continue
with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -406,10 +430,10 @@ if len(accession_with_errors_list) > 0:
with open(path_accession_with_errors_tsv, 'w') as fw:
fw.write('\n'.join(accession_with_errors_list))
-if len(not_created_accession_list) > 0:
+if len(not_created_accession_dict) > 0:
path_not_created_accession_tsv = 'not_created_accession.genbank.tsv'
print('Written not created accession in {}'.format(path_not_created_accession_tsv))
with open(path_not_created_accession_tsv, 'w') as fw:
- fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+ fw.write('\n'.join(['\t'.join([accession_version, ','.join(missing_info_list)]) for accession_version, missing_info_list in not_created_accession_dict.items()]))
print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))