diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/create_sra_metadata/create_sra_metadata.py | 11 | ||||
-rwxr-xr-x | scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 12 |
2 files changed, 22 insertions, 1 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index 554aea3..77cdf0d 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -14,6 +14,7 @@ from dateutil.parser import parse import xml.etree.ElementTree as ET import json import gzip +from datetime import datetime import sys sys.path.append('../') @@ -23,6 +24,8 @@ dir_yaml = 'yaml' date = '2020.07.09' +min_acceptable_collection_date = datetime(2019, 12, 1) + # Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D) # Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases) # -> Send to -> File -> Full XML -> Create File @@ -283,6 +286,14 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if accession not in not_created_accession_dict: not_created_accession_dict[accession] = [] not_created_accession_dict[accession].append('collection_date not found') + else: + year, month, day = [int(x) for x in info_for_yaml_dict['sample']['collection_date'].split('-')] + + collection_date_in_yaml = datetime(year, month, day) + + if accession not in not_created_accession_dict: + not_created_accession_dict[accession] = [] + not_created_accession_dict[accession].append('collection_date too early') if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: # print(accession_version, ' - technology not found') diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 5a8a336..8fbacd1 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -18,7 +18,7 @@ import json import os import requests -from datetime import date +from datetime import date, datetime from dateutil.parser import parse import sys @@ -27,6 +27,7 @@ from utils import is_integer, chunks, check_and_get_ontology_dictionaries num_ids_for_request = 100 +min_acceptable_collection_date = datetime(2019, 12, 1) dir_metadata = 'metadata_from_nuccore' dir_fasta_and_yaml = 'fasta_and_yaml' @@ -404,6 +405,15 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if accession_version not in not_created_accession_dict: not_created_accession_dict[accession_version] = [] not_created_accession_dict[accession_version].append('collection_date not found') + else: + year, month, day = [int(x) for x in info_for_yaml_dict['sample']['collection_date'].split('-')] + + collection_date_in_yaml = datetime(year, month, day) + + if collection_date_in_yaml < min_acceptable_collection_date: + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] + not_created_accession_dict[accession_version].append('collection_date too early') if 'authors' not in info_for_yaml_dict['submitter']: if accession_version not in not_created_accession_dict: |