diff options
| author | AndreaGuarracino | 2020-08-25 00:02:32 +0200 | 
|---|---|---|
| committer | AndreaGuarracino | 2020-08-25 00:02:32 +0200 | 
| commit | 2baa88b766ec540bd34b96599014dd16e393af39 (patch) | |
| tree | 96ecbcedc36a427f03281cf27389e5ee92af8648 /scripts | |
| parent | 5c44403e4516b6c809ecde1398b306bbde2c6727 (diff) | |
| download | bh20-seq-resource-2baa88b766ec540bd34b96599014dd16e393af39.tar.gz bh20-seq-resource-2baa88b766ec540bd34b96599014dd16e393af39.tar.lz bh20-seq-resource-2baa88b766ec540bd34b96599014dd16e393af39.zip | |
the YAML/FASTA pair is not created for samples where at least one mandatory field is missing
Diffstat (limited to 'scripts')
| -rwxr-xr-x | scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 34 | 
1 files changed, 29 insertions, 5 deletions
| diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index f314a1d..eefdddb 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -139,7 +139,7 @@ min_len_to_count = 27500 num_seq_with_len_ge_X_bp = 0 missing_value_list = [] -not_created_accession_list = [] +not_created_accession_dict = {} accession_with_errors_list = [] for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: @@ -374,10 +374,34 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) elif GBQualifier_name_text == 'db_xref': info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1] - + # Check if mandatory fields are missing if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: #print(accession_version, ' - technology not found') - not_created_accession_list.append([accession_version, 'technology not found']) + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] + not_created_accession_dict[accession_version].append('sample_sequencing_technology not found') + + if 'collection_location' not in info_for_yaml_dict['sample']: + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] + not_created_accession_dict[accession_version].append('collection_location not found') + + if 'collection_date' not in info_for_yaml_dict['sample']: + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] + not_created_accession_dict[accession_version].append('collection_date not found') + + if 'authors' not in info_for_yaml_dict['submitter']: + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] + not_created_accession_dict[accession_version].append('authors not found') + + if 'host_species' not in info_for_yaml_dict['host']: + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] + not_created_accession_dict[accession_version].append('host_species not found') + + if accession_version in not_created_accession_dict: continue with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: @@ -406,10 +430,10 @@ if len(accession_with_errors_list) > 0: with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) -if len(not_created_accession_list) > 0: +if len(not_created_accession_dict) > 0: path_not_created_accession_tsv = 'not_created_accession.genbank.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: - fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) + fw.write('\n'.join(['\t'.join([accession_version, ','.join(missing_info_list)]) for accession_version, missing_info_list in not_created_accession_dict.items()])) print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) | 
