about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAndreaGuarracino2020-08-28 11:16:24 +0200
committerAndreaGuarracino2020-08-28 11:16:24 +0200
commit3165a31e321cbf4641f9afdcbea511ee66f673bb (patch)
tree7140a760be5903c6f5e63a38da7e0f76493530a8
parentcc8f99d50236b7d0c365990398785ecc319323ea (diff)
downloadbh20-seq-resource-3165a31e321cbf4641f9afdcbea511ee66f673bb.tar.gz
bh20-seq-resource-3165a31e321cbf4641f9afdcbea511ee66f673bb.tar.lz
bh20-seq-resource-3165a31e321cbf4641f9afdcbea511ee66f673bb.zip
Added a check (both locally and in the validation) that the sample_id in the metadata is the same as the one in the FASTA header (#103)
-rw-r--r--bh20seqanalyzer/main.py6
-rw-r--r--bh20sequploader/main.py13
-rw-r--r--bh20sequploader/qc_metadata.py2
-rwxr-xr-xscripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py2
4 files changed, 18 insertions, 5 deletions
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py
index 11349e5..5f00080 100644
--- a/bh20seqanalyzer/main.py
+++ b/bh20seqanalyzer/main.py
@@ -83,12 +83,16 @@ class SeqAnalyzer:
                     if n not in col:
                         continue
                     with col.open(n, 'rb') as qf:
-                        tgt = qc_fasta(qf)[0]
+                        tgt, seqlabel, seq_type = qc_fasta(qf)
                         if tgt != n and tgt != paired.get(n):
                             errors.append("Expected %s but magic says it should be %s", n, tgt)
                         elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"):
                             self.start_fastq_to_fasta(collection, n, sample_id)
                             return False
+
+                        # If it is a FASTA
+                        if sample_id != seqlabel:
+                            errors.append("Expected sample_id == seqlabel, but %s != %s" % (sample_id, seqlabel))
                 if tgt is None:
                     errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"])
             except Exception as v:
diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py
index 80c33c8..d2a6c5f 100644
--- a/bh20sequploader/main.py
+++ b/bh20sequploader/main.py
@@ -26,11 +26,14 @@ VALIDATED_PROJECT='lugli-j7d0g-5ct8p1i1wrgyjvp'
 
 def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
     failed = False
+    sample_id = ''
     try:
         log.debug("Checking metadata" if do_qc else "Skipping metadata check")
-        if do_qc and not qc_metadata(metadata.name):
-            log.warning("Failed metadata QC")
-            failed = True
+        if do_qc:
+            sample_id = qc_metadata(metadata.name)
+            if not sample_id:
+                log.warning("Failed metadata QC")
+                failed = True
     except Exception as e:
         log.exception("Failed metadata QC")
         failed = True
@@ -46,6 +49,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
 
             target[0] = ("reads_1."+target[0][0][6:], target[0][1])
             target[1] = ("reads_2."+target[1][0][6:], target[0][1])
+
+        if target[0][2] == 'text/fasta' and sample_id != target[0][1]:
+            raise ValueError("The sample_id field in the metadata must be the same as the FASTA header")
+
     except Exception as e:
         log.exception("Failed sequence QC")
         failed = True
diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py
index 27657b1..87fa0b3 100644
--- a/bh20sequploader/qc_metadata.py
+++ b/bh20sequploader/qc_metadata.py
@@ -39,4 +39,4 @@ def qc_metadata(metadatafile):
     if not rslt:
         raise Exception(reason)
 
-    return True
+    return metadata['sample']['sample_id']
diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
index 8ef76e1..8f765d7 100755
--- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -412,6 +412,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                 not_created_accession_dict[accession_version].append('host_species not found')
 
             if len(GBSeq_sequence.text) < min_len_to_count:
+                if accession_version not in not_created_accession_dict:
+                    not_created_accession_dict[accession_version] = []
                 not_created_accession_dict[accession_version].append('sequence shorter than {} bp'.format(min_len_to_count))
 
             if accession_version not in not_created_accession_dict: