aboutsummaryrefslogtreecommitdiff
path: root/bh20sequploader
diff options
context:
space:
mode:
authorPeter Amstutz2020-04-28 15:02:30 -0400
committerPeter Amstutz2020-04-28 15:03:06 -0400
commit518090d92f0d91c08e86ac8b96ea0274376e744b (patch)
treea24224a53135d0bde534d8e38d501fce4854556a /bh20sequploader
parentb4ec03398d5e74eeb33a4a2a396fe4518c0c5465 (diff)
downloadbh20-seq-resource-518090d92f0d91c08e86ac8b96ea0274376e744b.tar.gz
bh20-seq-resource-518090d92f0d91c08e86ac8b96ea0274376e744b.tar.lz
bh20-seq-resource-518090d92f0d91c08e86ac8b96ea0274376e744b.zip
Call clustalw to compare sequence to reference
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>
Diffstat (limited to 'bh20sequploader')
-rw-r--r--bh20sequploader/qc_fasta.py63
1 files changed, 55 insertions, 8 deletions
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index e47d66b..2600cfa 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -1,6 +1,25 @@
import pkg_resources
import tempfile
import magic
+import subprocess
+import tempfile
+import logging
+import re
+
+def read_fasta(sequence):
+ entries = 0
+ bases = []
+ label = None
+ for line in sequence:
+ if line.startswith(">"):
+ label = line
+ entries += 1
+ else:
+ bases.append(line)
+ if entries > 1:
+ raise ValueError("FASTA file contains multiple entries")
+ break
+ return label, bases
def qc_fasta(sequence):
schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
@@ -13,16 +32,44 @@ def qc_fasta(sequence):
sequence.seek(0)
if seq_type == "text/fasta":
# ensure that contains only one entry
- entries = 0
- for line in sequence:
- if line.startswith(">"):
- entries += 1
- if entries > 1:
- raise ValueError("FASTA file contains multiple entries")
- break
+ submitlabel, submitseq = read_fasta(sequence)
sequence.seek(0)
+
+ with tempfile.NamedTemporaryFile() as tmp1:
+ refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
+ tmp1.write(refstring)
+ tmp1.write(submitlabel.encode("utf8"))
+ tmp1.write(("".join(submitseq)).encode("utf8"))
+ tmp1.flush()
+ try:
+ cmd = ["clustalw", "-infile="+tmp1.name,
+ "-quicktree", "-iteration=none", "-type=DNA"]
+ print("QC checking similarity to reference")
+ print(" ".join(cmd))
+ result = subprocess.run(cmd, capture_output=True)
+ res = result.stdout.decode("utf-8")
+ g1 = re.search(r"^Sequence 1: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE)
+ refbp = float(g1.group(1))
+ g2 = re.search(r"^Sequence 2: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE)
+ subbp = float(g2.group(1))
+ g3 = re.search(r"^Sequences \(1:2\) Aligned\. Score: (\d+(\.\d+)?)$", res, flags=re.MULTILINE)
+ similarity = float(g3.group(1))
+
+ print(g1.group(0))
+ print(g2.group(0))
+ print(g3.group(0))
+ except Exception as e:
+ logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e)
+
+ if (subbp/refbp) < .7:
+ raise ValueError("QC fail: submit sequence length is shorter than 70% reference")
+ if (subbp/refbp) > 1.3:
+ raise ValueError("QC fail: submit sequence length is greater than 130% reference")
+ if similarity < 70.0:
+ raise ValueError("QC fail: submit similarity is less than 70%")
+
return "sequence.fasta"
elif seq_type == "text/fastq":
return "reads.fastq"
else:
- raise ValueError("Sequence file does not look like FASTA or FASTQ")
+ raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ")