diff options
-rw-r--r-- | bh20sequploader/qc_fasta.py | 63 | ||||
-rw-r--r-- | setup.py | 3 | ||||
-rw-r--r-- | workflows/pangenome-generate/odgi_to_rdf.cwl | 2 |
3 files changed, 58 insertions, 10 deletions
diff --git a/bh20sequploader/qc_fasta.py b/bh20sequploader/qc_fasta.py index e47d66b..2600cfa 100644 --- a/bh20sequploader/qc_fasta.py +++ b/bh20sequploader/qc_fasta.py @@ -1,6 +1,25 @@ import pkg_resources import tempfile import magic +import subprocess +import tempfile +import logging +import re + +def read_fasta(sequence): + entries = 0 + bases = [] + label = None + for line in sequence: + if line.startswith(">"): + label = line + entries += 1 + else: + bases.append(line) + if entries > 1: + raise ValueError("FASTA file contains multiple entries") + break + return label, bases def qc_fasta(sequence): schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") @@ -13,16 +32,44 @@ def qc_fasta(sequence): sequence.seek(0) if seq_type == "text/fasta": # ensure that contains only one entry - entries = 0 - for line in sequence: - if line.startswith(">"): - entries += 1 - if entries > 1: - raise ValueError("FASTA file contains multiple entries") - break + submitlabel, submitseq = read_fasta(sequence) sequence.seek(0) + + with tempfile.NamedTemporaryFile() as tmp1: + refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta") + tmp1.write(refstring) + tmp1.write(submitlabel.encode("utf8")) + tmp1.write(("".join(submitseq)).encode("utf8")) + tmp1.flush() + try: + cmd = ["clustalw", "-infile="+tmp1.name, + "-quicktree", "-iteration=none", "-type=DNA"] + print("QC checking similarity to reference") + print(" ".join(cmd)) + result = subprocess.run(cmd, capture_output=True) + res = result.stdout.decode("utf-8") + g1 = re.search(r"^Sequence 1: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE) + refbp = float(g1.group(1)) + g2 = re.search(r"^Sequence 2: [^ ]+ +(\d+) bp$", res, flags=re.MULTILINE) + subbp = float(g2.group(1)) + g3 = re.search(r"^Sequences \(1:2\) Aligned\. Score: (\d+(\.\d+)?)$", res, flags=re.MULTILINE) + similarity = float(g3.group(1)) + + print(g1.group(0)) + print(g2.group(0)) + print(g3.group(0)) + except Exception as e: + logging.warn("Error trying to QC against reference sequence using 'clustalw': %s", e) + + if (subbp/refbp) < .7: + raise ValueError("QC fail: submit sequence length is shorter than 70% reference") + if (subbp/refbp) > 1.3: + raise ValueError("QC fail: submit sequence length is greater than 130% reference") + if similarity < 70.0: + raise ValueError("QC fail: submit similarity is less than 70%") + return "sequence.fasta" elif seq_type == "text/fastq": return "reads.fastq" else: - raise ValueError("Sequence file does not look like FASTA or FASTQ") + raise ValueError("Sequence file does not look like a DNA FASTA or FASTQ") @@ -34,7 +34,8 @@ setup( package_data={"bh20sequploader": ["bh20seq-schema.yml", "bh20seq-options.yml", "bh20seq-shex.rdf", - "validation/formats"], + "validation/formats", + "SARS-CoV-2-reference.fasta",], }, install_requires=install_requires, extras_require={ diff --git a/workflows/pangenome-generate/odgi_to_rdf.cwl b/workflows/pangenome-generate/odgi_to_rdf.cwl index 079d6fb..7f91509 100644 --- a/workflows/pangenome-generate/odgi_to_rdf.cwl +++ b/workflows/pangenome-generate/odgi_to_rdf.cwl @@ -3,7 +3,7 @@ class: CommandLineTool cwlVersion: v1.1 hints: DockerRequirement: - dockerPull: spodgi/spodgi + dockerPull: jerven/spodgi requirements: InlineJavascriptRequirement: {} ShellCommandRequirement: {} |