diff options
author | Peter Amstutz | 2020-04-28 15:02:30 -0400 |
---|---|---|
committer | Peter Amstutz | 2020-04-28 15:03:06 -0400 |
commit | 518090d92f0d91c08e86ac8b96ea0274376e744b (patch) | |
tree | a24224a53135d0bde534d8e38d501fce4854556a /bh20sequploader | |
parent | b4ec03398d5e74eeb33a4a2a396fe4518c0c5465 (diff) | |
download | bh20-seq-resource-518090d92f0d91c08e86ac8b96ea0274376e744b.tar.gz bh20-seq-resource-518090d92f0d91c08e86ac8b96ea0274376e744b.tar.lz bh20-seq-resource-518090d92f0d91c08e86ac8b96ea0274376e744b.zip |
Call clustalw to compare sequence to reference
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
Diffstat (limited to 'bh20sequploader')
-rw-r--r-- | bh20sequploader/qc_fasta.py | 63 |
1 files changed, 55 insertions, 8 deletions
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index e47d66b..2600cfa 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -1,6 +1,25 @@ import pkg_resources import tempfile import magic +import subprocess +import tempfile +import logging +import re + +def read_fasta(sequence): + entries = 0 + bases = [] + label = None + for line in sequence: + if line.startswith(">"): + label = line + entries += 1 + else: + bases.append(line) + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + return label, bases def qc_fasta(sequence): schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") @@ -13,16 +32,44 @@ def qc_fasta(sequence): sequence.seek(0) if seq_type == "text/fasta": # ensure that contains only one entry - entries = 0 - for line in sequence: - if line.startswith(">"): - entries += 1 - if entries > 1: - raise ValueError("FASTA file contains multiple entries") - break + submitlabel, submitseq = read_fasta(sequence) sequence.seek(0) + + with tempfile.NamedTemporaryFile() as tmp1: + refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta") + tmp1.write(refstring) + tmp1.write(submitlabel.encode("utf8")) + tmp1.write(("".join(submitseq)).encode("utf8")) + tmp1.flush() + try: + cmd = ["clustalw", "-infile="+tmp1.name, + "-quicktree", "-iteration=none", "-type=DNA"] + print("QC checking similarity to reference") + print(" ".join(cmd)) + result = subprocess.run(cmd, capture_output=True) + res = result.stdout.decode("utf-8") + g1 = re.search(r"^Sequence 1: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE) + refbp = float(g1.group(1)) + g2 = re.search(r"^Sequence 2: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE) + subbp = float(g2.group(1)) + g3 = re.search(r"^Sequences \(1:2\) Aligned\. Score: (\d+(\.\d+)?)$", res, flags=re.MULTILINE) + similarity = float(g3.group(1)) + + print(g1.group(0)) + print(g2.group(0)) + print(g3.group(0)) + except Exception as e: + logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e) + + if (subbp/refbp) < .7: + raise ValueError("QC fail: submit sequence length is shorter than 70% reference") + if (subbp/refbp) > 1.3: + raise ValueError("QC fail: submit sequence length is greater than 130% reference") + if similarity < 70.0: + raise ValueError("QC fail: submit similarity is less than 70%") + return "sequence.fasta" elif seq_type == "text/fastq": return "reads.fastq" else: - raise ValueError("Sequence file does not look like FASTA or FASTQ") + raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ") |