aboutsummaryrefslogtreecommitdiff
path: root/bh20sequploader/qc_fasta.py
diff options
context:
space:
mode:
Diffstat (limited to 'bh20sequploader/qc_fasta.py')
-rw-r--r--bh20sequploader/qc_fasta.py35
1 files changed, 27 insertions, 8 deletions
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py
index 5c8cf3a..e198430 100644
--- a/bh20sequploader/qc_fasta.py
+++ b/bh20sequploader/qc_fasta.py
@@ -5,6 +5,8 @@ import subprocess
import tempfile
import logging
import re
+import io
+import gzip
log = logging.getLogger(__name__ )
@@ -23,7 +25,7 @@ def read_fasta(sequence):
raise ValueError("FASTA file contains multiple entries")
return label, bases
-def qc_fasta(sequence):
+def qc_fasta(arg_sequence):
log.debug("Starting qc_fasta")
schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
with tempfile.NamedTemporaryFile() as tmp:
@@ -31,12 +33,24 @@ def qc_fasta(sequence):
tmp.flush()
val = magic.Magic(magic_file=tmp.name,
uncompress=False, mime=True)
- seq_type = val.from_buffer(sequence.read(4096)).lower()
+
+ gz = ""
+ if arg_sequence.name.endswith(".gz"):
+ sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb')
+ gz = ".gz"
+ else:
+ sequence = arg_sequence
+
+ sequence = io.TextIOWrapper(sequence)
+ r = sequence.read(4096)
sequence.seek(0)
+
+ seqlabel = r[1:r.index("\n")]
+ seq_type = val.from_buffer(r).lower()
+
if seq_type == "text/fasta":
# ensure that contains only one entry
submitlabel, submitseq = read_fasta(sequence)
- sequence.seek(0)
with tempfile.NamedTemporaryFile() as tmp1:
refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
@@ -44,6 +58,9 @@ def qc_fasta(sequence):
tmp1.write(submitlabel.encode("utf8"))
tmp1.write(("".join(submitseq)).encode("utf8"))
tmp1.flush()
+ subbp = 0
+ refbp = 0
+ similarity = 0
try:
cmd = ["clustalw", "-infile="+tmp1.name,
"-quicktree", "-iteration=none", "-type=DNA"]
@@ -64,15 +81,17 @@ def qc_fasta(sequence):
except Exception as e:
logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e)
- if (subbp/refbp) < .7:
+ if refbp and (subbp/refbp) < .7:
raise ValueError("QC fail: submit sequence length is shorter than 70% reference")
- if (subbp/refbp) > 1.3:
+ if refbp and (subbp/refbp) > 1.3:
raise ValueError("QC fail: submit sequence length is greater than 130% reference")
- if similarity < 70.0:
+ if similarity and similarity < 70.0:
raise ValueError("QC fail: submit similarity is less than 70%")
+ if refbp == 0 or similarity == 0:
+ raise ValueError("QC fail")
- return "sequence.fasta"
+ return ("sequence.fasta"+gz, seqlabel)
elif seq_type == "text/fastq":
- return "reads.fastq"
+ return ("reads.fastq"+gz, seqlabel)
else:
raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ")