diff options
author | Peter Amstutz | 2020-06-22 15:06:10 +0000 |
---|---|---|
committer | Peter Amstutz | 2020-06-22 15:06:10 +0000 |
commit | 7daa9ff2cdba742a811db00c924ccde25fa2c9b6 (patch) | |
tree | f6f001d224c82825c08b091f221f85897f60d40d /bh20sequploader/qc_fasta.py | |
parent | 1554fb6b4daf263f034d46a5f5b26ebcc3e69d22 (diff) | |
download | bh20-seq-resource-7daa9ff2cdba742a811db00c924ccde25fa2c9b6.tar.gz bh20-seq-resource-7daa9ff2cdba742a811db00c924ccde25fa2c9b6.tar.lz bh20-seq-resource-7daa9ff2cdba742a811db00c924ccde25fa2c9b6.zip |
Handle upload & assembly of gzipped, paired-end fastq
Diffstat (limited to 'bh20sequploader/qc_fasta.py')
-rw-r--r-- | bh20sequploader/qc_fasta.py | 24 |
1 file changed, 19 insertions, 5 deletions
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index 5c8cf3a..607c8c0 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -5,6 +5,8 @@ import subprocess import tempfile import logging import re +import io +import gzip log = logging.getLogger(__name__ ) @@ -23,7 +25,7 @@ def read_fasta(sequence): raise ValueError("FASTA file contains multiple entries") return label, bases -def qc_fasta(sequence): +def qc_fasta(arg_sequence): log.debug("Starting qc_fasta") schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") with tempfile.NamedTemporaryFile() as tmp: @@ -31,12 +33,24 @@ def qc_fasta(sequence): tmp.flush() val = magic.Magic(magic_file=tmp.name, uncompress=False, mime=True) - seq_type = val.from_buffer(sequence.read(4096)).lower() + + gz = "" + if arg_sequence.name.endswith(".gz"): + sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb') + gz = ".gz" + else: + sequence = arg_sequence + + sequence = io.TextIOWrapper(sequence) + r = sequence.read(4096) sequence.seek(0) + + seqlabel = r[1:r.index("\n")] + seq_type = val.from_buffer(r).lower() + if seq_type == "text/fasta": # ensure that contains only one entry submitlabel, submitseq = read_fasta(sequence) - sequence.seek(0) with tempfile.NamedTemporaryFile() as tmp1: refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta") @@ -71,8 +85,8 @@ def qc_fasta(sequence): if similarity < 70.0: raise ValueError("QC fail: submit similarity is less than 70%") - return "sequence.fasta" + return ("sequence.fasta"+gz, seqlabel) elif seq_type == "text/fastq": - return "reads.fastq" + return ("reads.fastq"+gz, seqlabel) else: raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ") |