From 87e634040767373309fd7eb99784de0537f72059 Mon Sep 17 00:00:00 2001 From: Andrea Guarracino Date: Fri, 10 Jul 2020 11:50:06 +0200 Subject: other term for Homo sapiens (for SRA samples) --- scripts/dict_ontology_standardization/ncbi_host_species.csv | 1 + 1 file changed, 1 insertion(+) (limited to 'scripts') diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv index 40572a3..0bfc455 100644 --- a/scripts/dict_ontology_standardization/ncbi_host_species.csv +++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv @@ -2,6 +2,7 @@ Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 human,http://purl.obolibrary.org/obo/NCBITaxon_9606 Human,http://purl.obolibrary.org/obo/NCBITaxon_9606 sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 +homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666 Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974 Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685 -- cgit v1.2.3 From 1655762b516804dad3d71538e95d97d74653c3e9 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 13:43:59 +0200 Subject: updated metadata source --- .../SraExperimentPackage.2020.07.05.xml.gz | Bin 6502056 -> 0 bytes .../SraExperimentPackage.2020.07.09.xml.gz | Bin 0 -> 9744133 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz create mode 100644 scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz (limited to 'scripts') diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz deleted file mode 100644 index 88acb18..0000000 Binary files a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.05.xml.gz and /dev/null differ diff --git a/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz new file mode 100644 index 0000000..93ef550 Binary files /dev/null and b/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz differ -- cgit v1.2.3 From 8cb542fdf60273aec7ec107f8bc4896375381263 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 13:55:49 +0200 Subject: an output file is created with the accessions for which no YAML file is created --- scripts/create_sra_metadata/create_sra_metadata.py | 55 +++++++++++++--------- .../from_genbank_to_fasta_and_yaml.py | 10 +++- 2 files changed, 41 insertions(+), 24 deletions(-) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index ef0d119..10ac85b 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -8,7 +8,7 @@ import gzip dir_yaml = 'yaml' -date = '2020.07.05' +date = '2020.07.09' # Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D) # Query on SRA: 'txid2697049[Organism:noexp] NOT 0[Mbases ' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism:noexp%5D%20NOT%200[Mbases) @@ -50,13 +50,14 @@ sra_metadata_xml_file.close() EXPERIMENT_PACKAGE_SET = tree.getroot() missing_value_list = [] +not_created_accession_list = [] run_accession_set = set() run_accession_to_downloadble_file_url_dict = {} for i, EXPERIMENT_PACKAGE in 
enumerate(EXPERIMENT_PACKAGE_SET): #print(i, EXPERIMENT_PACKAGE) - + # A general default-empty yaml could be read from the definitive one info_for_yaml_dict = { 'id': 'placeholder', @@ -74,17 +75,17 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): #print(accession) info_for_yaml_dict['sample']['sample_id'] = accession - + #SRAFiles = RUN.find('SRAFiles') #if SRAFiles is not None: # url = SRAFiles.find('SRAFile').attrib['url'] # if 'sra-download.ncbi.nlm.nih.gov' in url: # run_accession_to_downloadble_file_url_dict[accession] = url - + SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE') SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE') - + for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list: VALUE = SAMPLE_ATTRIBUTE.find('VALUE') if VALUE is not None: @@ -101,7 +102,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text])) elif TAG_text in ['host_health_status', 'host health state']: if VALUE_text in term_to_uri_dict: - info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text] + info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text] elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']: missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text])) elif TAG_text in ['strain', 'isolate']: @@ -113,12 +114,12 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if value_to_insert in term_to_uri_dict: value_to_insert = term_to_uri_dict[value_to_insert] - - if 'virus_strain' not in info_for_yaml_dict: + + if 'virus_strain' not in info_for_yaml_dict: info_for_yaml_dict['virus']['virus_strain'] = value_to_insert else: info_for_yaml_dict['virus']['virus_strain'] += '; ' + value_to_insert - elif TAG_text in ['isolation_source', 'isolation source host-associated']: + elif TAG_text in ['isolation_source', 'isolation source host-associated']: if VALUE_text in term_to_uri_dict: info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]] else: @@ -145,7 +146,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): elif TAG_text == 'collected_by': if VALUE_text.lower() not in ['not available', 'missing']: name = VALUE_text in ['Dr. 
Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli'] - + info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text elif TAG_text == 'collecting institution': if VALUE_text.lower() not in ['not provided', 'na']: @@ -154,11 +155,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): if VALUE_text.lower() not in ['not applicable', 'missing', 'na']: date_to_write = VALUE_text date_is_estimated = True - + VALUE_text_list = VALUE_text.split('-') if len(VALUE_text_list) == 3: date_is_estimated = False - + if VALUE_text_list[1].isalpha(): date_to_write = parse(VALUE_text).strftime('%Y-%m-%d') elif len(VALUE_text_list) == 2: @@ -170,7 +171,7 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): date_to_write = "{}-01-15".format(VALUE_text) info_for_yaml_dict['sample']['collection_date'] = date_to_write - + if date_is_estimated: if 'additional_collection_information' in info_for_yaml_dict['sample']: info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text) @@ -188,8 +189,8 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id - - + + EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT') INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0] @@ -206,18 +207,18 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION') info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession'] - + if SUBMISSION.attrib['lab_name'].lower() not in ['na']: info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name'] - STUDY = EXPERIMENT_PACKAGE.find('STUDY') + STUDY = EXPERIMENT_PACKAGE.find('STUDY') info_for_yaml_dict['submitter']['publication'] = STUDY.attrib['alias'] - - + + Organization = EXPERIMENT_PACKAGE.find('Organization') Organization_Name = Organization.find('Name') info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text] - + Organization_Contact = Organization.find('Contact') if Organization_Contact is not None: Organization_Contact_Name = Organization_Contact.find('Name') @@ -231,20 +232,28 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): Organization_Address = Organization.find('Address') if Organization_Address is not None: info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']]) - + if 'collection_date' not in info_for_yaml_dict['sample']: info_for_yaml_dict['sample']['collection_date'] = '1970-01-01' info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing" if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: - print(accession, ' - technology not found') + #print(accession, ' - technology not found') + not_created_accession_list.append([accession, 'technology not found']) continue with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) - + if len(missing_value_list) > 0: path_missing_terms_tsv = 'missing_terms.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) 
with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) + +if len(not_created_accession_list) > 0: + path_not_created_accession_tsv = 'not_created_accession.tsv' + print('Written not created accession in {}'.format(path_not_created_accession_tsv)) + with open(path_not_created_accession_tsv, 'w') as fw: + fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) + diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 39e401a..d5b0ffd 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -138,6 +138,7 @@ min_len_to_count = 27500 num_seq_with_len_ge_X_bp = 0 missing_value_list = [] +not_created_accession_list = [] accession_with_errors_list = [] for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]: @@ -371,7 +372,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) if 'sample_sequencing_technology' not in info_for_yaml_dict['technology']: - print(accession_version, ' - technology not found') + #print(accession_version, ' - technology not found') + not_created_accession_list.append([accession_version, 'technology not found']) continue with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw: @@ -400,4 +402,10 @@ if len(accession_with_errors_list) > 0: with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) +if len(not_created_accession_list) > 0: + path_not_created_accession_tsv = 'not_created_accession.tsv' + print('Written not created accession in {}'.format(path_not_created_accession_tsv)) + with open(path_not_created_accession_tsv, 'w') as fw: + fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) + print('Num. 
new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp)) -- cgit v1.2.3 From 2eab71a70b8630649303a9319e1baf9fa06f8ab4 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 10 Jul 2020 15:39:02 +0200 Subject: metadata with missing host_species are not created --- scripts/create_sra_metadata/create_sra_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index 10ac85b..a31bd36 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -242,6 +242,11 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): not_created_accession_list.append([accession, 'technology not found']) continue + if 'host_species' not in info_for_yaml_dict['host']: + #print(accession, ' - technology not found') + not_created_accession_list.append([accession, 'missing host species']) + continue + with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw: json.dump(info_for_yaml_dict, fw, indent=2) -- cgit v1.2.3 From bb90f06da570624952d4b7001ee37fc7018e3a7d Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Sun, 12 Jul 2020 15:58:29 +0200 Subject: added a suffix to distinguish which script created the error/warning files --- scripts/create_sra_metadata/create_sra_metadata.py | 4 ++-- scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'scripts') diff --git a/scripts/create_sra_metadata/create_sra_metadata.py b/scripts/create_sra_metadata/create_sra_metadata.py index a31bd36..352a30e 100644 --- a/scripts/create_sra_metadata/create_sra_metadata.py +++ b/scripts/create_sra_metadata/create_sra_metadata.py @@ -251,13 +251,13 @@ for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET): json.dump(info_for_yaml_dict, fw, indent=2) if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' + path_missing_terms_tsv = 'missing_terms.sra.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) if len(not_created_accession_list) > 0: - path_not_created_accession_tsv = 'not_created_accession.tsv' + path_not_created_accession_tsv = 'not_created_accession.sra.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index d5b0ffd..dbebfbb 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -391,19 +391,19 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) continue if len(missing_value_list) > 0: - path_missing_terms_tsv = 'missing_terms.tsv' + path_missing_terms_tsv = 'missing_terms.genbank.tsv' print('Written missing terms in {}'.format(path_missing_terms_tsv)) with open(path_missing_terms_tsv, 'w') as fw: fw.write('\n'.join(missing_value_list)) if len(accession_with_errors_list) > 0: - path_accession_with_errors_tsv = 'accession_with_errors.tsv' + path_accession_with_errors_tsv = 'accession_with_errors.genbank.tsv' print('Written the 
accession with errors in {}'.format(path_accession_with_errors_tsv)) with open(path_accession_with_errors_tsv, 'w') as fw: fw.write('\n'.join(accession_with_errors_list)) if len(not_created_accession_list) > 0: - path_not_created_accession_tsv = 'not_created_accession.tsv' + path_not_created_accession_tsv = 'not_created_accession.genbank.tsv' print('Written not created accession in {}'.format(path_not_created_accession_tsv)) with open(path_not_created_accession_tsv, 'w') as fw: fw.write('\n'.join(['\t'.join(x) for x in not_created_accession_list])) -- cgit v1.2.3 From 6bfefe984a84fb215d61e045c49a4ab123bb7339 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 12:32:43 -0400 Subject: Catch exceptions Add script to cleanup bad uploads. Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20seqanalyzer/main.py | 19 +++++++++++-------- scripts/cleanup.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 scripts/cleanup.py (limited to 'scripts') diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index f2bb234..f18a93a 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -364,17 +364,20 @@ def main(): logging.info("Starting up, monitoring %s for uploads" % (args.uploader_project)) while True: - seqanalyzer.move_fastq_to_fasta_results() + try: + seqanalyzer.move_fastq_to_fasta_results() - new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]]) - at_least_one_new_valid_seq = False - for c in new_collections: - at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq + new_collections = arvados.util.list_all(api.collections().list, filters=[["owner_uuid", "=", args.uploader_project]]) + at_least_one_new_valid_seq = False + for c in new_collections: + at_least_one_new_valid_seq = seqanalyzer.validate_upload(c, args.revalidate) or at_least_one_new_valid_seq - if at_least_one_new_valid_seq and not args.no_start_analysis: - seqanalyzer.start_pangenome_analysis() + if at_least_one_new_valid_seq and not args.no_start_analysis: + seqanalyzer.start_pangenome_analysis() - seqanalyzer.copy_most_recent_result() + seqanalyzer.copy_most_recent_result() + except Exception as e: + logging.exeception("Error in main loop") if args.once: break diff --git a/scripts/cleanup.py b/scripts/cleanup.py new file mode 100644 index 0000000..f4bd0b4 --- /dev/null +++ b/scripts/cleanup.py @@ -0,0 +1,20 @@ +import arvados +import arvados.util + +api = arvados.api() + +patterns = [ + "%missing%`collection_location`%", + "%missing%`technology`%", + "%missing%`host_species`%", + "%QC fail: alignment%", + "%does not look like a valid URI%", + ] + +for p in patterns: + c = arvados.util.list_all(api.collections().list, filters=[ + ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"], + ["properties.errors", "like", p]]) + for i in c: + print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) + api.collections().delete(uuid=i["uuid"]).execute() -- cgit v1.2.3 From 474d15e17be63046a091615e89ba63adecdb109b Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 14:28:02 -0400 Subject: Cleanup script also clears errors for revalidate Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- scripts/cleanup.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/cleanup.py b/scripts/cleanup.py index f4bd0b4..6a82659 100644 --- a/scripts/cleanup.py +++ 
b/scripts/cleanup.py @@ -3,18 +3,36 @@ import arvados.util api = arvados.api() -patterns = [ +delete_patterns = [ "%missing%`collection_location`%", "%missing%`technology`%", "%missing%`host_species`%", "%QC fail: alignment%", "%does not look like a valid URI%", + "%Duplicate of%" ] -for p in patterns: +revalidate_patterns = [ + "%missing%`license`%" +] + +for p in delete_patterns: c = arvados.util.list_all(api.collections().list, filters=[ ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"], ["properties.errors", "like", p]]) for i in c: print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) api.collections().delete(uuid=i["uuid"]).execute() + +for p in revalidate_patterns: + c = arvados.util.list_all(api.collections().list, filters=[ + ["owner_uuid", "=", "lugli-j7d0g-n5clictpuvwk8aa"], + ["properties.errors", "like", p]]) + for i in c: + print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) + pr = i["properties"] + if "status" in pr: + del pr["status"] + if "errors" in pr: + del pr["errors"] + api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute() -- cgit v1.2.3 From b1750731b654be3322a6793f47d52fafcaaea9ac Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 16 Jul 2020 21:24:05 -0400 Subject: Report similarity == 0 Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- bh20sequploader/qc_fasta.py | 4 +--- scripts/cleanup.py | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'scripts') diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index 37eb4e8..0c7e16d 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -84,10 +84,8 @@ def qc_fasta(arg_sequence, check_with_clustalw=True): except Exception as e: logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e) - if similarity and similarity < 70.0: + if similarity < 70.0: raise ValueError("QC fail: alignment to reference was less than 70%% (was %2.2f%%)" % (similarity)) - if similarity == 0: - raise ValueError("QC fail") return ("sequence.fasta"+gz, seqlabel) elif seq_type == "text/fastq": diff --git a/scripts/cleanup.py b/scripts/cleanup.py index 6a82659..78f34c8 100644 --- a/scripts/cleanup.py +++ b/scripts/cleanup.py @@ -9,11 +9,14 @@ delete_patterns = [ "%missing%`host_species`%", "%QC fail: alignment%", "%does not look like a valid URI%", - "%Duplicate of%" + "%Duplicate of%", + "%No matching triples found for predicate obo:NCIT_C42781%", + "%does not look like a valid URI%" ] revalidate_patterns = [ - "%missing%`license`%" + "%missing%`license`%", + "%QC fail%" ] for p in delete_patterns: -- cgit v1.2.3
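
The cleanup patches above follow one flow: list collections in the uploader project whose properties.errors value matches an SQL-style 'like' pattern, then either trash the collection (delete patterns) or strip its 'status' and 'errors' properties so the analyzer revalidates it (revalidate patterns). The sketch below restates that flow using only the Arvados Python SDK calls already present in the patches (arvados.api(), arvados.util.list_all, collections().delete() and collections().update()); the --dry-run flag, the trimmed pattern lists, and the script name are illustrative assumptions and are not part of the committed scripts/cleanup.py.

    import argparse
    import arvados
    import arvados.util

    # Uploader project UUID as used in scripts/cleanup.py above.
    UPLOADER_PROJECT = "lugli-j7d0g-n5clictpuvwk8aa"

    # Example patterns only; the committed script carries longer lists of both kinds.
    DELETE_PATTERNS = ["%missing%`collection_location`%", "%QC fail%"]
    REVALIDATE_PATTERNS = ["%missing%`license`%"]

    def matching_collections(api, pattern):
        # 'like' filters use SQL wildcards, so '%' matches any substring of properties.errors.
        return arvados.util.list_all(
            api.collections().list,
            filters=[["owner_uuid", "=", UPLOADER_PROJECT],
                     ["properties.errors", "like", pattern]])

    def main():
        parser = argparse.ArgumentParser(description="Dry-run sketch of the cleanup flow")
        parser.add_argument("--dry-run", action="store_true",
                            help="report matching collections without changing anything")
        args = parser.parse_args()
        api = arvados.api()

        # Destructive pass: collections whose errors are not worth keeping get trashed.
        for pattern in DELETE_PATTERNS:
            for c in matching_collections(api, pattern):
                print("would trash" if args.dry_run else "trashing",
                      c["uuid"], c["properties"].get("sequence_label"))
                if not args.dry_run:
                    api.collections().delete(uuid=c["uuid"]).execute()

        # Revalidation pass: removing 'status' and 'errors' lets the analyzer pick the upload up again.
        for pattern in REVALIDATE_PATTERNS:
            for c in matching_collections(api, pattern):
                props = c["properties"]
                props.pop("status", None)
                props.pop("errors", None)
                print("would revalidate" if args.dry_run else "clearing status of",
                      c["uuid"], props.get("sequence_label"))
                if not args.dry_run:
                    api.collections().update(uuid=c["uuid"],
                                             body={"properties": props}).execute()

    if __name__ == "__main__":
        main()

Running the sketch with --dry-run first prints the collections each pattern would touch, which is a reasonable check before letting the destructive pass delete anything; again, that option is a hypothetical addition for illustration, not a feature of the committed script.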