aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreaGuarracino2021-01-07 23:02:54 +0100
committerAndreaGuarracino2021-01-07 23:02:54 +0100
commit141e619929cee17018417d71111063015e73c366 (patch)
treebaa8f98aaed7aef615d37b5560a988381ac8030f
parent430090a65786b0b9dde9e1319a740f95579d0275 (diff)
downloadbh20-seq-resource-141e619929cee17018417d71111063015e73c366.tar.gz
bh20-seq-resource-141e619929cee17018417d71111063015e73c366.tar.lz
bh20-seq-resource-141e619929cee17018417d71111063015e73c366.zip
added check_sequence workflow and script
-rw-r--r--workflows/yamlfa2ttl/check_format.cwl3
-rw-r--r--workflows/yamlfa2ttl/check_sequence.cwl20
-rw-r--r--workflows/yamlfa2ttl/check_sequence.py63
-rw-r--r--workflows/yamlfa2ttl/yamlfa2ttl.cwl8
4 files changed, 93 insertions, 1 deletions
diff --git a/workflows/yamlfa2ttl/check_format.cwl b/workflows/yamlfa2ttl/check_format.cwl
index 24de620..9eee58a 100644
--- a/workflows/yamlfa2ttl/check_format.cwl
+++ b/workflows/yamlfa2ttl/check_format.cwl
@@ -3,6 +3,7 @@
cwlVersion: v1.1
class: CommandLineTool
baseCommand: python3
+
inputs:
script:
type: File
@@ -18,5 +19,5 @@ inputs:
type: File
inputBinding: {position: 4}
default: {class: File, location: ../../bh20sequploader/validation/formats}
-outputs: []
+outputs: []
diff --git a/workflows/yamlfa2ttl/check_sequence.cwl b/workflows/yamlfa2ttl/check_sequence.cwl
new file mode 100644
index 0000000..946948d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.cwl
@@ -0,0 +1,20 @@
+#!/usr/bin/env cwl-runner
+
+cwlVersion: v1.1
+class: CommandLineTool
+baseCommand: python3
+
+inputs:
+ script:
+ type: File
+ inputBinding: {position: 1}
+ default: {class: File, location: check_sequence.py}
+ path_fasta:
+ type: string
+ inputBinding: {position: 2}
+ path_sars_cov_2_reference_fasta:
+ type: File
+ inputBinding: {position: 3}
+ default: {class: File, location: ../../bh20sequploader/SARS-CoV-2-reference.fasta}
+
+outputs: []
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py
new file mode 100644
index 0000000..f92bf6d
--- /dev/null
+++ b/workflows/yamlfa2ttl/check_sequence.py
@@ -0,0 +1,63 @@
+import subprocess
+import tempfile
+import sys
+
+path_fasta = sys.argv[1]
+path_sars_cov_2_reference_fasta = sys.argv[2]
+
+
+def read_single_fasta(path_fasta):
+ with open(path_fasta) as f:
+ entries = 0
+ header = None
+ sequence = []
+ for line in f:
+ line = line.strip()
+
+ if line.startswith(">"):
+ header = line
+ entries += 1
+ else:
+ sequence.append(line)
+
+ if entries > 1:
+ raise ValueError("Input FASTA contains multiple entries")
+
+ return header, ''.join(sequence)
+
+
+print("FASTA QC: checking similarity to the reference")
+
+header, sequence = read_single_fasta(path_fasta)
+
+similarity = 0
+
+with tempfile.NamedTemporaryFile() as tmp_fasta:
+ with tempfile.NamedTemporaryFile() as tmp_sars_cov_2_reference_fasta:
+ with open(path_sars_cov_2_reference_fasta, 'rb') as f:
+ tmp_sars_cov_2_reference_fasta.write(f.read())
+ tmp_sars_cov_2_reference_fasta.flush()
+
+ tmp_fasta.write(f'>{header}\n'.encode("utf8"))
+ tmp_fasta.write(sequence.encode("utf8"))
+ tmp_fasta.flush()
+
+ cmd = [
+ "minimap2", "-c", "-x", "asm20",
+ tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
+ ]
+ print(" ".join(cmd))
+
+ result = subprocess.run(cmd, stdout=subprocess.PIPE)
+ result.check_returncode()
+
+ paf_output = result.stdout.decode("utf-8")
+ paf_output = paf_output.split("\t")
+ if len(paf_output) >= 10:
+ # Number of matching bases in the mapping / Target sequence length
+ similarity = (float(paf_output[9]) / float(paf_output[6])) * 100.0
+
+if similarity < 70.0:
+ raise ValueError(
+ f"FASTA QC fail for '{header}': the similarity against the reference {similarity} was less than 70%')"
+ )
diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
index 1dce9ca..143fc9d 100644
--- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl
+++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
@@ -18,7 +18,15 @@ steps:
in:
path_fasta: path_fasta
format_to_check: format_to_check
+ doc: the input has to be a valid FASTA format file
out: []
run: check_format.cwl
+ check_sequence:
+ in:
+ path_fasta: path_fasta
+ doc: the input sequence has to be enough similar to the reference
+ out: []
+ run: check_sequence.cwl
+
outputs: []