Do not create YAML files for samples with a collection date before December 2019

author: AndreaGuarracino 2020-11-13 22:04:36 +0100
committer: AndreaGuarracino 2020-11-13 22:04:36 +0100
commit: a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e (patch)
tree: e2544a90619a2001a5a3cb2332c681d0128ff1a5 /scripts
parent: 852fbbd14a174addcfe729ab2b54e556228984a4 (diff)
download: bh20-seq-resource-a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e.tar.gz
bh20-seq-resource-a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e.tar.lz
bh20-seq-resource-a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e.zip
2 files changed, 22 insertions, 1 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index 554aea3..77cdf0d 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -14,6 +14,7 @@ from dateutil.parser import parse
 import xml.etree.ElementTree as ET
 import json
 import gzip
+from datetime import datetime
 
 import sys
 sys.path.append('../')
@@ -23,6 +24,8 @@ dir_yaml = 'yaml'
 
 date = '2020.07.09'
 
+min_acceptable_collection_date = datetime(2019, 12, 1)
+
 # Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D)
 # Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases)
 #         -> Send to -> File -> Full XML -> Create File
@@ -283,6 +286,16 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
         if accession not in not_created_accession_dict:
             not_created_accession_dict[accession] = []
         not_created_accession_dict[accession].append('collection_date not found')
+    else:
+        year, month, day = [int(x) for x in info_for_yaml_dict['sample']['collection_date'].split('-')]
+
+        collection_date_in_yaml = datetime(year, month, day)
+
+        # Flag the accession only when its collection date precedes the minimum acceptable date
+        if collection_date_in_yaml < min_acceptable_collection_date:
+            if accession not in not_created_accession_dict:
+                not_created_accession_dict[accession] = []
+            not_created_accession_dict[accession].append('collection_date too early')
 
     if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
         # print(accession_version, ' - technology not found')
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 5a8a336..8fbacd1 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -18,7 +18,7 @@ import json
 import os
 import requests
 
-from datetime import date
+from datetime import date, datetime
 from dateutil.parser import parse
 
 import sys
@@ -27,6 +27,7 @@ from utils import is_integer, chunks, check_and_get_ontology_dictionaries
 
 
 num_ids_for_request = 100
+min_acceptable_collection_date = datetime(2019, 12, 1)
 
 dir_metadata = 'metadata_from_nuccore'
 dir_fasta_and_yaml = 'fasta_and_yaml'
@@ -404,6 +405,15 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                 if accession_version not in not_created_accession_dict:
                     not_created_accession_dict[accession_version] = []
                 not_created_accession_dict[accession_version].append('collection_date not found')
+            else:
+                year, month, day = [int(x) for x in info_for_yaml_dict['sample']['collection_date'].split('-')]
+
+                collection_date_in_yaml = datetime(year, month, day)
+
+                if collection_date_in_yaml < min_acceptable_collection_date:
+                    if accession_version not in not_created_accession_dict:
+                        not_created_accession_dict[accession_version] = []
+                    not_created_accession_dict[accession_version].append('collection_date too early')
 
             if 'authors' not in info_for_yaml_dict['submitter']:
                 if accession_version not in not_created_accession_dict:
author	AndreaGuarracino	2020-11-13 22:04:36 +0100
committer	AndreaGuarracino	2020-11-13 22:04:36 +0100
commit	a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e (patch)
tree	e2544a90619a2001a5a3cb2332c681d0128ff1a5 /scripts
parent	852fbbd14a174addcfe729ab2b54e556228984a4 (diff)
download	bh20-seq-resource-a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e.tar.gz bh20-seq-resource-a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e.tar.lz bh20-seq-resource-a5f5a85b9db5e84f3680e85b7116f324fd6d2e2e.zip