aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreaGuarracino2020-09-04 09:59:47 +0200
committerAndreaGuarracino2020-09-04 09:59:47 +0200
commit842f1a25cc2632771e8a816dbb4898db81d7a6c2 (patch)
tree462fa2e7e6c42f7ca0cd054ab15afee109daac30
parent7067de8cfe0607fcae13360b80a3763b86f9664c (diff)
downloadbh20-seq-resource-842f1a25cc2632771e8a816dbb4898db81d7a6c2.tar.gz
bh20-seq-resource-842f1a25cc2632771e8a816dbb4898db81d7a6c2.tar.lz
bh20-seq-resource-842f1a25cc2632771e8a816dbb4898db81d7a6c2.zip
synchronized the create_sra_metadata.py script with the latest updates
-rw-r--r--scripts/create_sra_metadata/create_sra_metadata.py83
-rwxr-xr-xscripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py2
2 files changed, 65 insertions, 20 deletions
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index 0e1215e..2a05d26 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -1,10 +1,19 @@
#!/usr/bin/env python3
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore in all steps, 1 id per line', required=False)
+parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
+ default='../dict_ontology_standardization/', required=False)
+
+args = parser.parse_args()
+
import os
from dateutil.parser import parse
import xml.etree.ElementTree as ET
import json
import gzip
+import sys
dir_yaml = 'yaml'
@@ -15,9 +24,30 @@ date = '2020.07.09'
# -> Send to -> File -> Full XML -> Create File
path_sra_metadata_xml = 'SraExperimentPackage.{}.xml.gz'.format(date)
-dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+dir_dict_ontology_standardization = args.dict_ontology
path_sra_study_accessions_txt = 'SRAStudyAccessions.{}.txt'.format(date)
+
+def is_integer(string_to_check):
+ try:
+ int(string_to_check)
+ return True
+ except ValueError:
+ return False
+
+
+accession_to_ignore_set = set()
+
+if args.ids_to_ignore:
+ if not os.path.exists(args.ids_to_ignore):
+ print("\tThe '{}' file doesn't exist.".format(args.ids_to_ignore))
+ sys.exit(-1)
+
+ with open(args.ids_to_ignore) as f:
+ accession_to_ignore_set.update(set([x.split('.')[0] for x in f.read().strip('\n').split('\n')]))
+ print('There are {} accessions to ignore.'.format(len(accession_to_ignore_set)))
+
+
term_to_uri_dict = {}
for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
@@ -38,16 +68,11 @@ for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_x
term_to_uri_dict[term] = uri
-def is_integer(string_to_check):
- try:
- int(string_to_check)
- return True
- except ValueError:
- return False
if not os.path.exists(dir_yaml):
os.makedirs(dir_yaml)
+
sra_metadata_xml_file = gzip.open(path_sra_metadata_xml, 'r')
tree = ET.parse(sra_metadata_xml_file)
sra_metadata_xml_file.close()
@@ -55,7 +80,7 @@ sra_metadata_xml_file.close()
EXPERIMENT_PACKAGE_SET = tree.getroot()
missing_value_list = []
-not_created_accession_list = []
+not_created_accession_dict = {}
run_accession_set = set()
run_accession_to_downloadble_file_url_dict = {}
@@ -170,7 +195,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if VALUE_text_list[1].isalpha():
date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
elif len(VALUE_text_list) == 2:
- date_to_write = VALUE_text + '-15'
+ date_to_write = VALUE_text + '-15'
else:
if int(VALUE_text) < 2020:
date_to_write = "{}-12-15".format(VALUE_text)
@@ -247,18 +272,37 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"
+
+ # Check if mandatory fields are missing
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- #print(accession, ' - technology not found')
- not_created_accession_list.append([accession, 'technology not found'])
- continue
+ # print(accession_version, ' - technology not found')
+ if accession not in not_created_accession_dict:
+ not_created_accession_dict[accession] = []
+ not_created_accession_dict[accession].append('sample_sequencing_technology not found')
+
+ if 'collection_location' not in info_for_yaml_dict['sample']:
+ if accession not in not_created_accession_dict:
+ not_created_accession_dict[accession] = []
+ not_created_accession_dict[accession].append('collection_location not found')
+
+ if 'collection_date' not in info_for_yaml_dict['sample']:
+ if accession not in not_created_accession_dict:
+ not_created_accession_dict[accession] = []
+ not_created_accession_dict[accession].append('collection_date not found')
+
+ if 'authors' not in info_for_yaml_dict['submitter']:
+ if accession not in not_created_accession_dict:
+ not_created_accession_dict[accession] = []
+ not_created_accession_dict[accession].append('authors not found')
if 'host_species' not in info_for_yaml_dict['host']:
- #print(accession, ' - technology not found')
- not_created_accession_list.append([accession, 'missing host species'])
- continue
+ if accession not in not_created_accession_dict:
+ not_created_accession_dict[accession] = []
+ not_created_accession_dict[accession].append('host_species not found')
- with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
- json.dump(info_for_yaml_dict, fw, indent=2)
+ if accession not in not_created_accession_dict:
+ with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
+ json.dump(info_for_yaml_dict, fw, indent=2)
if len(missing_value_list) > 0:
path_missing_terms_tsv = 'missing_terms.sra.tsv'
@@ -266,9 +310,10 @@ if len(missing_value_list) > 0:
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
-if len(not_created_accession_list) > 0:
+if len(not_created_accession_dict) > 0:
path_not_created_accession_tsv = 'not_created_accession.sra.tsv'
print('Written not created accession in {}'.format(path_not_created_accession_tsv))
with open(path_not_created_accession_tsv, 'w') as fw:
- fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+ fw.write('\n'.join(['\t'.join([accession_version, ','.join(missing_info_list)]) for accession_version, missing_info_list in not_created_accession_dict.items()]))
+
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 8f765d7..7251819 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -6,7 +6,7 @@ parser.add_argument('--ids-to-ignore', type=str, help='file with ids to ignore i
parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
parser.add_argument('--only-missing-ids', action='store_true', help='download only missing ids not already downloaded', required=False)
parser.add_argument('--dict-ontology', type=str, help='where is the ontology',
- default='../dict_ontology_standardization/',required=False)
+ default='../dict_ontology_standardization/', required=False)
args = parser.parse_args()
from Bio import Entrez