about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--workflows/yamlfa2ttl/check_format.cwl23
-rw-r--r--workflows/yamlfa2ttl/check_format.py31
-rw-r--r--workflows/yamlfa2ttl/check_sequence.cwl20
-rw-r--r--workflows/yamlfa2ttl/check_sequence.py63
-rw-r--r--workflows/yamlfa2ttl/yamlfa2ttl.cwl32
5 files changed, 169 insertions, 0 deletions
diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl
new file mode 100644
index 0000000..9eee58a
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_format.cwl
@@ -0,0 +1,23 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+
+inputs:
+  script:
+    type: File
+    inputBinding: {position: 1}
+    default: {class: File, location: check_format.py}
+  path_fasta:
+    type: string
+    inputBinding: {position: 2}
+  format_to_check:
+    type: string
+    inputBinding: {position: 3}
+  path_valid_formats:
+    type: File
+    inputBinding: {position: 4}
+    default: {class: File, location: ../../bh20sequploader/validation/formats}
+
+outputs: []
diff --git a/workflows/yamlfa2ttl/check_format.py b/workflows/yamlfa2ttl/check_format.py
new file mode 100644
index 0000000..4472b18
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_format.py
@@ -0,0 +1,31 @@
+import gzip
+import tempfile
+import magic
+import io
+import sys
+
+path_fasta = sys.argv[1]
+format_to_check = sys.argv[2]
+path_valid_formats = sys.argv[3]
+
+with tempfile.NamedTemporaryFile() as tmp:
+    with open(path_valid_formats, 'rb') as f:
+        tmp.write(f.read())
+    tmp.flush()
+
+    check_format = magic.Magic(magic_file=tmp.name, uncompress=False, mime=True)
+
+with open(path_fasta, "rb") as f:
+    gz = ""
+    if path_fasta.endswith(".gz"):
+        gz = ".gz"
+        f = gzip.GzipFile(fileobj=f, mode='rb')
+
+    f = io.TextIOWrapper(f)
+
+    buffer = f.read(4096)
+    seq_type = check_format.from_buffer(buffer).lower()
+    f.detach()
+
+    if seq_type != format_to_check:
+        raise ValueError(f"Input file ({path_fasta}) does not look like a {format_to_check}")
diff --git a/workflows/yamlfa2ttl/check_sequence.cwl b/workflows/yamlfa2ttl/check_sequence.cwl
new file mode 100644
index 0000000..946948d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.cwl
@@ -0,0 +1,20 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+
+inputs:
+  script:
+    type: File
+    inputBinding: {position: 1}
+    default: {class: File, location: check_sequence.py}
+  path_fasta:
+    type: string
+    inputBinding: {position: 2}
+  path_sars_cov_2_reference_fasta:
+    type: File
+    inputBinding: {position: 3}
+    default: {class: File, location: ../../bh20sequploader/SARS-CoV-2-reference.fasta}
+
+outputs: []
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py
new file mode 100644
index 0000000..f92bf6d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.py
@@ -0,0 +1,63 @@
+import subprocess
+import tempfile
+import sys
+
+path_fasta = sys.argv[1]
+path_sars_cov_2_reference_fasta = sys.argv[2]
+
+
+def read_single_fasta(path_fasta):
+    with open(path_fasta) as f:
+        entries = 0
+        header = None
+        sequence = []
+        for line in f:
+            line = line.strip()
+
+            if line.startswith(">"):
+                header = line
+                entries += 1
+            else:
+                sequence.append(line)
+
+            if entries > 1:
+                raise ValueError("Input FASTA contains multiple entries")
+
+    return header, ''.join(sequence)
+
+
+print("FASTA QC: checking similarity to the reference")
+
+header, sequence = read_single_fasta(path_fasta)
+
+similarity = 0
+
+with tempfile.NamedTemporaryFile() as tmp_fasta:
+    with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta:
+        with open(path_sars_cov_2_reference_fasta, 'rb') as f:
+            tmp_sars_cov_2_reference_fasta.write(f.read())
+        tmp_sars_cov_2_reference_fasta.flush()
+
+        tmp_fasta.write(f'>{header}\n'.encode("utf8"))
+        tmp_fasta.write(sequence.encode("utf8"))
+        tmp_fasta.flush()
+
+        cmd = [
+            "minimap2", "-c", "-x", "asm20",
+            tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
+        ]
+        print(" ".join(cmd))
+
+        result = subprocess.run(cmd, stdout=subprocess.PIPE)
+        result.check_returncode()
+
+        paf_output = result.stdout.decode("utf-8")
+        paf_output = paf_output.split("\t")
+        if len(paf_output) >= 10:
+            # Number of matching bases in the mapping / Target sequence length
+            similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0
+
+if similarity < 70.0:
+    raise ValueError(
+        f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')"
+    )
diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
new file mode 100644
index 0000000..143fc9d
--- /dev/null
+++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
@@ -0,0 +1,32 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: Workflow
+doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)"
+
+inputs:
+  path_fasta:
+    type: string
+    doc: input fasta to validate
+
+  format_to_check:
+    type: string
+    default: text/fasta
+
+steps:
+  check_format:
+    in:
+      path_fasta: path_fasta
+      format_to_check: format_to_check
+    doc: the input has to be a valid FASTA format file
+    out: []
+    run: check_format.cwl
+
+  check_sequence:
+    in:
+      path_fasta: path_fasta
+    doc: the input sequence has to be enough similar to the reference
+    out: []
+    run: check_sequence.cwl
+
+outputs: []