aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--workflows/yamlfa2ttl/check_format.cwl23
-rw-r--r--workflows/yamlfa2ttl/check_format.py31
-rw-r--r--workflows/yamlfa2ttl/check_sequence.cwl20
-rw-r--r--workflows/yamlfa2ttl/check_sequence.py63
-rw-r--r--workflows/yamlfa2ttl/yamlfa2ttl.cwl32
5 files changed, 169 insertions, 0 deletions
diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl
new file mode 100644
index 0000000..9eee58a
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_format.cwl
@@ -0,0 +1,23 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+
+inputs:
+ script:
+ type: File
+ inputBinding: {position: 1}
+ default: {class: File, location: check_format.py}
+ path_fasta:
+ type: string
+ inputBinding: {position: 2}
+ format_to_check:
+ type: string
+ inputBinding: {position: 3}
+ path_valid_formats:
+ type: File
+ inputBinding: {position: 4}
+ default: {class: File, location: ../../bh20sequploader/validation/formats}
+
+outputs: []
diff --git a/workflows/yamlfa2ttl/check_format.py b/workflows/yamlfa2ttl/check_format.py
new file mode 100644
index 0000000..4472b18
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_format.py
@@ -0,0 +1,31 @@
+import gzip
+import tempfile
+import magic
+import io
+import sys
+
+path_fasta = sys.argv[1]
+format_to_check = sys.argv[2]
+path_valid_formats = sys.argv[3]
+
+with tempfile.NamedTemporaryFile() as tmp:
+ with open(path_valid_formats, 'rb') as f:
+ tmp.write(f.read())
+ tmp.flush()
+
+ check_format = magic.Magic(magic_file=tmp.name, uncompress=False, mime=True)
+
+with open(path_fasta, "rb") as f:
+ gz = ""
+ if path_fasta.endswith(".gz"):
+ gz = ".gz"
+ f = gzip.GzipFile(fileobj=f, mode='rb')
+
+ f = io.TextIOWrapper(f)
+
+ buffer = f.read(4096)
+ seq_type = check_format.from_buffer(buffer).lower()
+ f.detach()
+
+ if seq_type != format_to_check:
+ raise ValueError(f"Input file ({path_fasta}) does not look like a {format_to_check}")
diff --git a/workflows/yamlfa2ttl/check_sequence.cwl b/workflows/yamlfa2ttl/check_sequence.cwl
new file mode 100644
index 0000000..946948d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.cwl
@@ -0,0 +1,20 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+
+inputs:
+ script:
+ type: File
+ inputBinding: {position: 1}
+ default: {class: File, location: check_sequence.py}
+ path_fasta:
+ type: string
+ inputBinding: {position: 2}
+ path_sars_cov_2_reference_fasta:
+ type: File
+ inputBinding: {position: 3}
+ default: {class: File, location: ../../bh20sequploader/SARS-CoV-2-reference.fasta}
+
+outputs: []
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py
new file mode 100644
index 0000000..f92bf6d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.py
@@ -0,0 +1,63 @@
+import subprocess
+import tempfile
+import sys
+
+path_fasta = sys.argv[1]
+path_sars_cov_2_reference_fasta = sys.argv[2]
+
+
+def read_single_fasta(path_fasta):
+ with open(path_fasta) as f:
+ entries = 0
+ header = None
+ sequence = []
+ for line in f:
+ line = line.strip()
+
+ if line.startswith(">"):
+ header = line
+ entries += 1
+ else:
+ sequence.append(line)
+
+ if entries > 1:
+ raise ValueError("Input FASTA contains multiple entries")
+
+ return header, ''.join(sequence)
+
+
+print("FASTA QC: checking similarity to the reference")
+
+header, sequence = read_single_fasta(path_fasta)
+
+similarity = 0
+
+with tempfile.NamedTemporaryFile() as tmp_fasta:
+ with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta:
+ with open(path_sars_cov_2_reference_fasta, 'rb') as f:
+ tmp_sars_cov_2_reference_fasta.write(f.read())
+ tmp_sars_cov_2_reference_fasta.flush()
+
+ tmp_fasta.write(f'>{header}\n'.encode("utf8"))
+ tmp_fasta.write(sequence.encode("utf8"))
+ tmp_fasta.flush()
+
+ cmd = [
+ "minimap2", "-c", "-x", "asm20",
+ tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
+ ]
+ print(" ".join(cmd))
+
+ result = subprocess.run(cmd, stdout=subprocess.PIPE)
+ result.check_returncode()
+
+ paf_output = result.stdout.decode("utf-8")
+ paf_output = paf_output.split("\t")
+ if len(paf_output) >= 10:
+ # Number of matching bases in the mapping / Target sequence length
+ similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0
+
+if similarity < 70.0:
+ raise ValueError(
+ f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')"
+ )
diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
new file mode 100644
index 0000000..143fc9d
--- /dev/null
+++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
@@ -0,0 +1,32 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: Workflow
+doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)"
+
+inputs:
+ path_fasta:
+ type: string
+ doc: input fasta to validate
+
+ format_to_check:
+ type: string
+ default: text/fasta
+
+steps:
+ check_format:
+ in:
+ path_fasta: path_fasta
+ format_to_check: format_to_check
+ doc: the input has to be a valid FASTA format file
+ out: []
+ run: check_format.cwl
+
+ check_sequence:
+ in:
+ path_fasta: path_fasta
+ doc: the input sequence has to be enough similar to the reference
+ out: []
+ run: check_sequence.cwl
+
+outputs: []