From 87e634040767373309fd7eb99784de0537f72059 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Fri, 10 Jul 2020 11:50:06 +0200 Subject: other term for Homo sapiens (for SRA samples) --- scripts/dict_ontology_standardization/ncbi_host_species.csv | 1 + 1 file changed, 1 insertion(+) (limited to 'scripts') diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv index 40572a3..0bfc455 100644 --- a/scripts/dict_ontology_standardization/ncbi_host_species.csv +++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv @@ -2,6 +2,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 human,http://purl.obolibrary.org/obo/NCBITaxon_9606 Human,http://purl.obolibrary.org/obo/NCBITaxon_9606 sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 +homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666 Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974 Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685 -- cgit v1.2.3 From 1655762b516804dad3d71538e95d97d74653c3e9 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 13:43:59 +0200 Subject: updated metadata source --- .../SraExperimentPackage.2020.07.05.xml.gz | Bin 6502056 -> 0 bytes .../SraExperimentPackage.2020.07.09.xml.gz | Bin 0 -> 9744133 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz create mode 100644 scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz (limited to 'scripts') diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz deleted file mode 100644 index 88acb18..0000000 Binary files a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz and /dev/null differ diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz new file mode 100644 index 0000000..93ef550 Binary files /dev/null and b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz differ -- cgit v1.2.3 From 8cb542fdf60273aec7ec107f8bc4896375381263 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 13:55:49 +0200 Subject: an output file is created with the accessions for which no YAML file is created --- scripts/create_sra_metadata/create_sra_metadata.py | 55 +++++++++++++--------- .../from_genbank_to_fasta_and_yaml.py | 10 +++- 2 files changed, 41 insertions(+), 24 deletions(-) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index ef0d119..10ac85b 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -8,7 +8,7 @@ import gzip dir_yaml = 'yaml' -date = '2020.07.05' +date = '2020.07.09' # Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D) # Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases) @@ -50,13 +50,14 @@ sra_metadata_xml_file.close() EXPERIMENT_PACKAGE_SET = tree.getroot() missing_value_list = [] +not_created_accession_list = [] run_accession_set = set() run_accession_to_downloadble_file_url_dict = {} for i, EXPERIMENT_PACKAGE in 
enumerate(EXPERIMENT_PACKAGE_SET): #print(i, EXPERIMENT_PACKAGE) - + # A general default-empty yaml could be read from the definitive one info_for_yaml_dict = { 'id': 'placeholder', @@ -74,17 +75,17 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): #print(accession) info_for_yaml_dict['sample']['sample_id'] = accession - + #SRAFiles = RUN.find('SRAFiles') #if SRAFiles is not None: # url = SRAFiles.find('SRAFile').attrib['url'] # if 'sra-download.ncbi.nlm.nih.gov' in url: # run_accession_to_downloadble_file_url_dict[accession] = url - + SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE') SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE') - + for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list: VALUE = SAMPLE_ATTRIBUTE.find('VALUE') if VALUE is not None: @@ -101,7 +102,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text])) elif TAG_text in ['host_health_status', 'host health state']: if VALUE_text in term_to_uri_dict: - info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text] + info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text] elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']: missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text])) elif TAG_text in ['strain', 'isolate']: @@ -113,12 +114,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if value_to_insert in term_to_uri_dict: value_to_insert = term_to_uri_dict[value_to_insert] - - if 'virus_strain' not in info_for_yaml_dict: + + if 'virus_strain' not in info_for_yaml_dict: info_for_yaml_dict['virus']['virus_strain'] = value_to_insert else: info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert - elif TAG_text in ['isolation_source', 'isolation source host-associated']: + elif TAG_text in ['isolation_source', 'isolation source host-associated']: if VALUE_text in term_to_uri_dict: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]] else: @@ -145,7 +146,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): elif TAG_text == 'collected_by': if VALUE_text.lower() not in ['not available', 'missing']: name = VALUE_text in ['Dr. 
Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli'] - + info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text elif TAG_text == 'collecting institution': if VALUE_text.lower() not in ['not provided', 'na']: @@ -154,11 +155,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if VALUE_text.lower() not in ['not applicable', 'missing', 'na']: date_to_write = VALUE_text date_is_estimated = True - + VALUE_text_list = VALUE_text.split('-') if len(VALUE_text_list) == 3: date_is_estimated = False - + if VALUE_text_list[1].isalpha(): date_to_write = parse(VALUE_text).strftime('%Y-%m-%d') elif len(VALUE_text_list) == 2: @@ -170,7 +171,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): date_to_write = "{}-01-15".format(VALUE_text) info_for_yaml_dict['sample']['collection_date'] = date_to_write - + if date_is_estimated: if 'additional_collection_information' in info_for_yaml_dict['sample']: info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) @@ -188,8 +189,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id - - + + EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT') INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0] @@ -206,18 +207,18 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION') info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession'] - + if SUBMISSION.attrib['lab_name'].lower() not in ['na']: info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name'] - STUDY = EXPERIMENT_PACKAGE.find('STUDY') + STUDY = EXPERIMENT_PACKAGE.find('STUDY') info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias'] - - + + Organization = EXPERIMENT_PACKAGE.find('Organization') Organization_Name = Organization.find('Name') info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text] - + Organization_Contact = Organization.find('Contact') if Organization_Contact is not None: Organization_Contact_Name = Organization_Contact.find('Name') @@ -231,20 +232,28 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): Organization_Address = Organization.find('Address') if Organization_Address is not None: info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']]) - + if 'collection_date' not in info_for_yaml_dict['sample']: info_for_yaml_dict['sample']['collection_date'] = '1970-01-01' info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing" if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: - print(accession, ' - technology not found') + #print(accession, ' - technology not found') + not_created_accession_list.append([accession, 'technology not found']) continue with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) - + if len(missing_value_list) > 0: path_missing_terms_tsv = 'missing_terms.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) 
with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) + +if len(not_created_accession_list) > 0: + path_not_created_accession_tsv = 'not_created_accession.tsv' + print('Written not created accession in {}'.format(path_not_created_accession_tsv)) + with open(path_not_created_accession_tsv, 'w') as fw: + fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) + diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 39e401a..d5b0ffd 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -138,6 +138,7 @@ min_len_to_count = 27500 num_seq_with_len_ge_X_bp = 0 missing_value_list = [] +not_created_accession_list = [] accession_with_errors_list = [] for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: @@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: - print(accession_version, ' - technology not found') + #print(accession_version, ' - technology not found') + not_created_accession_list.append([accession_version, 'technology not found']) continue with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: @@ -400,4 +402,10 @@ if len(accession_with_errors_list) > 0: with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) +if len(not_created_accession_list) > 0: + path_not_created_accession_tsv = 'not_created_accession.tsv' + print('Written not created accession in {}'.format(path_not_created_accession_tsv)) + with open(path_not_created_accession_tsv, 'w') as fw: + fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) + print('Num. 
new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) -- cgit v1.2.3 From 2eab71a70b8630649303a9319e1baf9fa06f8ab4 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 15:39:02 +0200 Subject: metadata with missing host_species are not created --- scripts/create_sra_metadata/create_sra_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index 10ac85b..a31bd36 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -242,6 +242,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): not_created_accession_list.append([accession, 'technology not found']) continue + if 'host_species' not in info_for_yaml_dict['host']: + #print(accession, ' - technology not found') + not_created_accession_list.append([accession, 'missing host species']) + continue + with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) -- cgit v1.2.3 From bb90f06da570624952d4b7001ee37fc7018e3a7d Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 12 Jul 2020 15:58:29 +0200 Subject: added a suffix to distinguish which script created the error/warning files --- scripts/create_sra_metadata/create_sra_metadata.py | 4 ++-- scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index a31bd36..352a30e 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -251,13 +251,13 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): json.dump(info_for_yaml_dict, fw, indent=2) if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' + path_missing_terms_tsv = 'missing_terms.sra.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) if len(not_created_accession_list) > 0: - path_not_created_accession_tsv = 'not_created_accession.tsv' + path_not_created_accession_tsv = 'not_created_accession.sra.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index d5b0ffd..dbebfbb 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -391,19 +391,19 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) continue if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' + path_missing_terms_tsv = 'missing_terms.genbank.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) if len(accession_with_errors_list) > 0: - path_accession_with_errors_tsv = 'accession_with_errors.tsv' + path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv' print('Written the 
accession with errors in {}'.format(path_accession_with_errors_tsv)) with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) if len(not_created_accession_list) > 0: - path_not_created_accession_tsv = 'not_created_accession.tsv' + path_not_created_accession_tsv = 'not_created_accession.genbank.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) -- cgit v1.2.3 From 6bfefe984a84fb215d61e045c49a4ab123bb7339 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 12:32:43 -0400 Subject: Catch exceptions Add script to cleanup bad uploads. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20seqanalyzer/main.py | 19 +++++++++++-------- scripts/cleanup.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 scripts/cleanup.py (limited to 'scripts') diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index f2bb234..f18a93a 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -364,17 +364,20 @@ def main(): logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) while True: - seqanalyzer.move_fastq_to_fasta_results() + try: + seqanalyzer.move_fastq_to_fasta_results() - new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]]) - at_least_one_new_valid_seq = False - for c in new_collections: - at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq + new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]]) + at_least_one_new_valid_seq = False + for c in new_collections: + at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq - if at_least_one_new_valid_seq and not args.no_start_analysis: - seqanalyzer.start_pangenome_analysis() + if at_least_one_new_valid_seq and not args.no_start_analysis: + seqanalyzer.start_pangenome_analysis() - seqanalyzer.copy_most_recent_result() + seqanalyzer.copy_most_recent_result() + except Exception as e: + logging.exeception("Error in main loop") if args.once: break diff --git a/scripts/cleanup.py b/scripts/cleanup.py new file mode 100644 index 0000000..f4bd0b4 --- /dev/null +++ b/scripts/cleanup.py @@ -0,0 +1,20 @@ +import arvados +import arvados.util + +api = arvados.api() + +patterns = [ + "%missing%`collection_location`%", + "%missing%`technology`%", + "%missing%`host_species`%", + "%QC fail: alignment%", + "%does not look like a valid URI%", + ] + +for p in patterns: + c = arvados.util.list_all(api.collections().list, filters=[ + ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"], + ["properties.errors", "like", p]]) + for i in c: + print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) + api.collections().delete(uuid=i["uuid"]).execute() -- cgit v1.2.3 From 474d15e17be63046a091615e89ba63adecdb109b Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 14:28:02 -0400 Subject: Cleanup script also clears errors for revalidate Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- scripts/cleanup.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/cleanup.py b/scripts/cleanup.py index f4bd0b4..6a82659 100644 --- a/scripts/cleanup.py +++ 
b/scripts/cleanup.py @@ -3,18 +3,36 @@ import arvados.util api = arvados.api() -patterns = [ +delete_patterns = [ "%missing%`collection_location`%", "%missing%`technology`%", "%missing%`host_species`%", "%QC fail: alignment%", "%does not look like a valid URI%", + "%Duplicate of%" ] -for p in patterns: +revalidate_patterns = [ + "%missing%`license`%" +] + +for p in delete_patterns: c = arvados.util.list_all(api.collections().list, filters=[ ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"], ["properties.errors", "like", p]]) for i in c: print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) api.collections().delete(uuid=i["uuid"]).execute() + +for p in revalidate_patterns: + c = arvados.util.list_all(api.collections().list, filters=[ + ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"], + ["properties.errors", "like", p]]) + for i in c: + print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) + pr = i["properties"] + if "status" in pr: + del pr["status"] + if "errors" in pr: + del pr["errors"] + api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute() -- cgit v1.2.3 From b1750731b654be3322a6793f47d52fafcaaea9ac Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 21:24:05 -0400 Subject: Report similarity == 0 Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/qc_fasta.py | 4 +--- scripts/cleanup.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'scripts') diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index 37eb4e8..0c7e16d 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True): except Exception as e: logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e) - if similarity and similarity < 70.0: + if similarity < 70.0: raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity)) - if similarity == 0: - raise ValueError("QC fail") return ("sequence.fasta"+gz, seqlabel) elif seq_type == "text/fastq": diff --git a/scripts/cleanup.py b/scripts/cleanup.py index 6a82659..78f34c8 100644 --- a/scripts/cleanup.py +++ b/scripts/cleanup.py @@ -9,11 +9,14 @@ delete_patterns = [ "%missing%`host_species`%", "%QC fail: alignment%", "%does not look like a valid URI%", - "%Duplicate of%" + "%Duplicate of%", + "%No matching triples found for predicate obo:NCIT_C42781%", + "%does not look like a valid URI%" ] revalidate_patterns = [ - "%missing%`license`%" + "%missing%`license`%", + "%QC fail%" ] for p in delete_patterns: -- cgit v1.2.3
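
The cleanup patches above follow one flow: list collections in the uploader project whose properties.errors value matches an SQL-style 'like' pattern, then either trash the collection (delete patterns) or strip its 'status' and 'errors' properties so the analyzer revalidates it (revalidate patterns). The sketch below restates that flow using only the Arvados Python SDK calls already present in the patches (arvados.api(), arvados.util.list_all, collections().delete() and collections().update()); the --dry-run flag, the trimmed pattern lists, and the script name are illustrative assumptions and are not part of the committed scripts/cleanup.py.

    import argparse
    import arvados
    import arvados.util

    # Uploader project UUID as used in scripts/cleanup.py above.
    UPLOADER_PROJECT = "lugli-j7d0g-n5clictpuvwk8aa"

    # Example patterns only; the committed script carries longer lists of both kinds.
    DELETE_PATTERNS = ["%missing%`collection_location`%", "%QC fail%"]
    REVALIDATE_PATTERNS = ["%missing%`license`%"]

    def matching_collections(api, pattern):
        # 'like' filters use SQL wildcards, so '%' matches any substring of properties.errors.
        return arvados.util.list_all(
            api.collections().list,
            filters=[["owner_uuid", "=", UPLOADER_PROJECT],
                     ["properties.errors", "like", pattern]])

    def main():
        parser = argparse.ArgumentParser(description="Dry-run sketch of the cleanup flow")
        parser.add_argument("--dry-run", action="store_true",
                            help="report matching collections without changing anything")
        args = parser.parse_args()
        api = arvados.api()

        # Destructive pass: collections whose errors are not worth keeping get trashed.
        for pattern in DELETE_PATTERNS:
            for c in matching_collections(api, pattern):
                print("would trash" if args.dry_run else "trashing",
                      c["uuid"], c["properties"].get("sequence_label"))
                if not args.dry_run:
                    api.collections().delete(uuid=c["uuid"]).execute()

        # Revalidation pass: removing 'status' and 'errors' lets the analyzer pick the upload up again.
        for pattern in REVALIDATE_PATTERNS:
            for c in matching_collections(api, pattern):
                props = c["properties"]
                props.pop("status", None)
                props.pop("errors", None)
                print("would revalidate" if args.dry_run else "clearing status of",
                      c["uuid"], props.get("sequence_label"))
                if not args.dry_run:
                    api.collections().update(uuid=c["uuid"],
                                             body={"properties": props}).execute()

    if __name__ == "__main__":
        main()

Running the sketch with --dry-run first prints the collections each pattern would touch, which is a reasonable check before letting the destructive pass delete anything; again, that option is a hypothetical addition for illustration, not a feature of the committed script.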