diff options
author | Peter Amstutz | 2020-06-22 16:32:54 +0000 |
---|---|---|
committer | Peter Amstutz | 2020-06-22 16:32:54 +0000 |
commit | 4001368ab29c988e94dddd29767c4b64a5bd2a5b (patch) | |
tree | 5642bf6a5cd02bc76b0aa98ae086b010f5131cf0 | |
parent | 7daa9ff2cdba742a811db00c924ccde25fa2c9b6 (diff) | |
download | bh20-seq-resource-4001368ab29c988e94dddd29767c4b64a5bd2a5b.tar.gz bh20-seq-resource-4001368ab29c988e94dddd29767c4b64a5bd2a5b.tar.lz bh20-seq-resource-4001368ab29c988e94dddd29767c4b64a5bd2a5b.zip |
Better invalid sequence QC handling
-rw-r--r-- | bh20seqanalyzer/main.py | 36 |
-rw-r--r-- | bh20sequploader/qc_fasta.py | 11 |
2 files changed, 27 insertions, 20 deletions
diff --git a/bh20seqanalyzer/main.py b/bh20seqanalyzer/main.py index 794ce27..9164190 100644 --- a/bh20seqanalyzer/main.py +++ b/bh20seqanalyzer/main.py @@ -39,22 +39,25 @@ def validate_upload(api, collection, validated_project, logging.warn("Failed metadata qc") if valid: - tgt = None - paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"} - for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"): - if n not in col: - continue - with col.open(n, 'rb') as qf: - tgt = qc_fasta(qf)[0] - if tgt != n and tgt != paired.get(n): - logging.info("Expected %s but magic says it should be %s", n, tgt) - valid = False - elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"): - start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid, n) - return False - if tgt is None: + try: + tgt = None + paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"} + for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"): + if n not in col: + continue + with col.open(n, 'rb') as qf: + tgt = qc_fasta(qf)[0] + if tgt != n and tgt != paired.get(n): + logging.info("Expected %s but magic says it should be %s", n, tgt) + valid = False + elif tgt in ("reads.fastq", "reads.fastq.gz", "reads_1.fastq", "reads_1.fastq.gz"): + start_fastq_to_fasta(api, collection, fastq_project, fastq_workflow_uuid, n) + return False + if tgt is None: + valid = False + logging.warn("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"]) + except ValueError as v: valid = False - logging.warn("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq", collection["name"]) dup = api.collections().list(filters=[["owner_uuid", "=", validated_project], ["portable_data_hash", "=", col.portable_data_hash()]]).execute() @@ -70,9 +73,8 @@ def validate_upload(api, collection, validated_project, 
"owner_uuid": validated_project, "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute() else: - pass # It is invalid, delete it. - #logging.warn("Deleting '%s'" % collection["name"]) + logging.warn("Suggest deleting '%s'" % collection["name"]) #api.collections().delete(uuid=collection["uuid"]).execute() return valid diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index 607c8c0..e198430 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -58,6 +58,9 @@ def qc_fasta(arg_sequence): tmp1.write(submitlabel.encode("utf8")) tmp1.write(("".join(submitseq)).encode("utf8")) tmp1.flush() + subbp = 0 + refbp = 0 + similarity = 0 try: cmd = ["clustalw", "-infile="+tmp1.name, "-quicktree", "-iteration=none", "-type=DNA"] @@ -78,12 +81,14 @@ def qc_fasta(arg_sequence): except Exception as e: logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e) - if (subbp/refbp) < .7: + if refbp and (subbp/refbp) < .7: raise ValueError("QC fail: submit sequence length is shorter than 70% reference") - if (subbp/refbp) > 1.3: + if refbp and (subbp/refbp) > 1.3: raise ValueError("QC fail: submit sequence length is greater than 130% reference") - if similarity < 70.0: + if similarity and similarity < 70.0: raise ValueError("QC fail: submit similarity is less than 70%") + if refbp == 0 or similarity == 0: + raise ValueError("QC fail") return ("sequence.fasta"+gz, seqlabel) elif seq_type == "text/fastq": |