about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/cleanup.py 41
-rw-r--r-- scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz bin 6502056 -> 0 bytes
-rw-r--r-- scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz bin 0 -> 9744133 bytes
-rw-r--r-- scripts/create_sra_metadata/create_sra_metadata.py 62
-rw-r--r-- scripts/dict_ontology_standardization/ncbi_host_species.csv 1
-rwxr-xr-x scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py 14
6 files changed, 91 insertions, 27 deletions
diff --git a/scripts/cleanup.py b/scripts/cleanup.py
new file mode 100644
index 0000000..78f34c8
--- /dev/null
+++ b/scripts/cleanup.py
@@ -0,0 +1,41 @@
+import arvados
+import arvados.util
+
+api = arvados.api()
+
+delete_patterns = [
+ "%missing%`collection_location`%",
+ "%missing%`technology`%",
+ "%missing%`host_species`%",
+ "%QC fail: alignment%",
+ "%does not look like a valid URI%",
+ "%Duplicate of%",
+ "%No matching triples found for predicate obo:NCIT_C42781%",
+ "%does not look like a valid URI%"
+ ]
+
+revalidate_patterns = [
+ "%missing%`license`%",
+ "%QC fail%"
+]
+
+for p in delete_patterns:
+ c = arvados.util.list_all(api.collections().list, filters=[
+ ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+ ["properties.errors", "like", p]])
+ for i in c:
+ print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+ api.collections().delete(uuid=i["uuid"]).execute()
+
+for p in revalidate_patterns:
+ c = arvados.util.list_all(api.collections().list, filters=[
+ ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"],
+ ["properties.errors", "like", p]])
+ for i in c:
+ print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
+ pr = i["properties"]
+ if "status" in pr:
+ del pr["status"]
+ if "errors" in pr:
+ del pr["errors"]
+ api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute()
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
deleted file mode 100644
index 88acb18..0000000
--- a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz
+++ /dev/null
Binary files differ
diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
new file mode 100644
index 0000000..93ef550
--- /dev/null
+++ b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
Binary files differ
diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py
index ef0d119..352a30e 100644
--- a/scripts/create_sra_metadata/create_sra_metadata.py
+++ b/scripts/create_sra_metadata/create_sra_metadata.py
@@ -8,7 +8,7 @@ import gzip
dir_yaml = 'yaml'
-date = '2020.07.05'
+date = '2020.07.09'
# Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D)
# Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases)
@@ -50,13 +50,14 @@ sra_metadata_xml_file.close()
EXPERIMENT_PACKAGE_SET = tree.getroot()
missing_value_list = []
+not_created_accession_list = []
run_accession_set = set()
run_accession_to_downloadble_file_url_dict = {}
for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
#print(i, EXPERIMENT_PACKAGE)
-
+
# A general default-empty yaml could be read from the definitive one
info_for_yaml_dict = {
'id': 'placeholder',
@@ -74,17 +75,17 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
#print(accession)
info_for_yaml_dict['sample']['sample_id'] = accession
-
+
#SRAFiles = RUN.find('SRAFiles')
#if SRAFiles is not None:
# url = SRAFiles.find('SRAFile').attrib['url']
# if 'sra-download.ncbi.nlm.nih.gov' in url:
# run_accession_to_downloadble_file_url_dict[accession] = url
-
+
SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE')
SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE')
-
+
for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list:
VALUE = SAMPLE_ATTRIBUTE.find('VALUE')
if VALUE is not None:
@@ -101,7 +102,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
elif TAG_text in ['host_health_status', 'host health state']:
if VALUE_text in term_to_uri_dict:
- info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
+ info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]
elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
elif TAG_text in ['strain', 'isolate']:
@@ -113,12 +114,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if value_to_insert in term_to_uri_dict:
value_to_insert = term_to_uri_dict[value_to_insert]
-
- if 'virus_strain' not in info_for_yaml_dict:
+
+ if 'virus_strain' not in info_for_yaml_dict:
info_for_yaml_dict['virus']['virus_strain'] = value_to_insert
else:
info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert
- elif TAG_text in ['isolation_source', 'isolation source host-associated']:
+ elif TAG_text in ['isolation_source', 'isolation source host-associated']:
if VALUE_text in term_to_uri_dict:
info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
else:
@@ -145,7 +146,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
elif TAG_text == 'collected_by':
if VALUE_text.lower() not in ['not available', 'missing']:
name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli']
-
+
info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text
elif TAG_text == 'collecting institution':
if VALUE_text.lower() not in ['not provided', 'na']:
@@ -154,11 +155,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
if VALUE_text.lower() not in ['not applicable', 'missing', 'na']:
date_to_write = VALUE_text
date_is_estimated = True
-
+
VALUE_text_list = VALUE_text.split('-')
if len(VALUE_text_list) == 3:
date_is_estimated = False
-
+
if VALUE_text_list[1].isalpha():
date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
elif len(VALUE_text_list) == 2:
@@ -170,7 +171,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
date_to_write = "{}-01-15".format(VALUE_text)
info_for_yaml_dict['sample']['collection_date'] = date_to_write
-
+
if date_is_estimated:
if 'additional_collection_information' in info_for_yaml_dict['sample']:
info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
@@ -188,8 +189,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id
-
-
+
+
EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]
@@ -206,18 +207,18 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION')
info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession']
-
+
if SUBMISSION.attrib['lab_name'].lower() not in ['na']:
info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name']
- STUDY = EXPERIMENT_PACKAGE.find('STUDY')
+ STUDY = EXPERIMENT_PACKAGE.find('STUDY')
info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias']
-
-
+
+
Organization = EXPERIMENT_PACKAGE.find('Organization')
Organization_Name = Organization.find('Name')
info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text]
-
+
Organization_Contact = Organization.find('Contact')
if Organization_Contact is not None:
Organization_Contact_Name = Organization_Contact.find('Name')
@@ -231,20 +232,33 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
Organization_Address = Organization.find('Address')
if Organization_Address is not None:
info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']])
-
+
if 'collection_date' not in info_for_yaml_dict['sample']:
info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession, ' - technology not found')
+ #print(accession, ' - technology not found')
+ not_created_accession_list.append([accession, 'technology not found'])
+ continue
+
+ if 'host_species' not in info_for_yaml_dict['host']:
+ #print(accession, ' - technology not found')
+ not_created_accession_list.append([accession, 'missing host species'])
continue
with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
json.dump(info_for_yaml_dict, fw, indent=2)
-
+
if len(missing_value_list) > 0:
- path_missing_terms_tsv = 'missing_terms.tsv'
+ path_missing_terms_tsv = 'missing_terms.sra.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
+
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.sra.tsv'
+ print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv
index 40572a3..0bfc455 100644
--- a/scripts/dict_ontology_standardization/ncbi_host_species.csv
+++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv
@@ -2,6 +2,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
human,http://purl.obolibrary.org/obo/NCBITaxon_9606
Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
+homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 39e401a..dbebfbb 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -138,6 +138,7 @@ min_len_to_count = 27500
num_seq_with_len_ge_X_bp = 0
missing_value_list = []
+not_created_accession_list = []
accession_with_errors_list = []
for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
@@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']:
- print(accession_version, ' - technology not found')
+ #print(accession_version, ' - technology not found')
+ not_created_accession_list.append([accession_version, 'technology not found'])
continue
with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
@@ -389,15 +391,21 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
continue
if len(missing_value_list) > 0:
- path_missing_terms_tsv = 'missing_terms.tsv'
+ path_missing_terms_tsv = 'missing_terms.genbank.tsv'
print('Written missing terms in {}'.format(path_missing_terms_tsv))
with open(path_missing_terms_tsv, 'w') as fw:
fw.write('\n'.join(missing_value_list))
if len(accession_with_errors_list) > 0:
- path_accession_with_errors_tsv = 'accession_with_errors.tsv'
+ path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv'
print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
with open(path_accession_with_errors_tsv, 'w') as fw:
fw.write('\n'.join(accession_with_errors_list))
+if len(not_created_accession_list) > 0:
+ path_not_created_accession_tsv = 'not_created_accession.genbank.tsv'
+ print('Written not created accession in {}'.format(path_not_created_accession_tsv))
+ with open(path_not_created_accession_tsv, 'w') as fw:
+ fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list]))
+
print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))