From 3165a31e321cbf4641f9afdcbea511ee66f673bb Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Fri, 28 Aug 2020 11:16:24 +0200 Subject: added control (locally and in the validation) that sample_id has to be the same in the metadata and in the FASTA header #103 --- bh20seqanalyzer/main.py | 6 +++++- bh20sequploader/main.py | 13 ++++++++++--- bh20sequploader/qc_metadata.py | 2 +- .../download_genbank_data/from_genbank_to_fasta_and_yaml.py | 2 ++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 11349e5..5f00080 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -83,12 +83,16 @@ class SeqAnalyzer: if n not in col: continue with col.open(n, 'rb') as qf: - tgt = qc_fasta(qf)[0] + tgt, seqlabel, seq_type = qc_fasta(qf) if tgt != n and tgt != paired.get(n): errors.append("Expected %s but magic says it should be %s", n, tgt) elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"): self.start_fastq_to_fasta(collection, n, sample_id) return False + + # If it is a FASTA + if sample_id != seqlabel: + errors.append("Expected sample_id == seqlabel, but %s != %s" % (sample_id, seqlabel)) if tgt is None: errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"]) except Exception as v: diff --git a/bh20sequploader/main.py b/bh20sequploader/main.py index 80c33c8..d2a6c5f 100644 --- a/bh20sequploader/main.py +++ b/bh20sequploader/main.py @@ -26,11 +26,14 @@ VALIDATED_PROJECT='lugli-j7d0g-5ct8p1i1wrgyjvp' def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True): failed = False + sample_id = '' try: log.debug("Checking metadata" if do_qc else "Skipping metadata check") - if do_qc and not qc_metadata(metadata.name): - log.warning("Failed metadata QC") - failed = True + if do_qc: + sample_id = qc_metadata(metadata.name) + if not sample_id: + log.warning("Failed metadata QC") + failed = True except Exception as e: log.exception("Failed metadata QC") failed = True @@ -46,6 +49,10 @@ def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True): target[0] = ("reads_1."+target[0][0][6:], target[0][1]) target[1] = ("reads_2."+target[1][0][6:], target[0][1]) + + if target[0][2] == 'text/fasta' and sample_id != target[0][1]: + raise ValueError("The sample_id field in the metadata must be the same as the FASTA header") + except Exception as e: log.exception("Failed sequence QC") failed = True diff --git a/bh20sequploader/qc_metadata.py b/bh20sequploader/qc_metadata.py index 27657b1..87fa0b3 100644 --- a/bh20sequploader/qc_metadata.py +++ b/bh20sequploader/qc_metadata.py @@ -39,4 +39,4 @@ def qc_metadata(metadatafile): if not rslt: raise Exception(reason) - return True + return metadata['sample']['sample_id'] diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py index 8ef76e1..8f765d7 100755 --- a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py +++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py @@ -412,6 +412,8 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) not_created_accession_dict[accession_version].append('host_species not found') if len(GBSeq_sequence.text) < min_len_to_count: + if accession_version not in not_created_accession_dict: + not_created_accession_dict[accession_version] = [] not_created_accession_dict[accession_version].append('sequence shorter than {} bp'.format(min_len_to_count)) if accession_version not in not_created_accession_dict: -- cgit v1.2.3