From 53a47a18a45270c3f037fabe9cc973c66bfe50bf Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Fri, 8 Jan 2021 23:44:47 +0100
Subject: sample_id in the FASTA has to match the sample_id in the YAML

---
 workflows/yamlfa2ttl/check_metadata.cwl |  7 +++++--
 workflows/yamlfa2ttl/check_metadata.py  | 28 ++++++++++++++++++++++++----
 workflows/yamlfa2ttl/check_sequence.py  |  4 ++--
 workflows/yamlfa2ttl/yamlfa2ttl.cwl     | 20 +++++++-------------
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/workflows/yamlfa2ttl/check_metadata.cwl b/workflows/yamlfa2ttl/check_metadata.cwl
index 72c4d36..593155b 100644
--- a/workflows/yamlfa2ttl/check_metadata.cwl
+++ b/workflows/yamlfa2ttl/check_metadata.cwl
@@ -12,13 +12,16 @@ inputs:
   path_yaml:
     type: string
     inputBinding: {position: 2}
+  path_fasta:
+    type: string
+    inputBinding: {position: 3}
   path_schema_yaml:
     type: File
-    inputBinding: {position: 3}
+    inputBinding: {position: 4}
     default: {class: File, location: ../../bh20sequploader/bh20seq-schema.yml}
   path_shex_rdf:
     type: File
-    inputBinding: {position: 4}
+    inputBinding: {position: 5}
     default: {class: File, location: ../../bh20sequploader/bh20seq-shex.rdf}
 
 outputs: []
diff --git a/workflows/yamlfa2ttl/check_metadata.py b/workflows/yamlfa2ttl/check_metadata.py
index 05494ca..6dd0d5b 100644
--- a/workflows/yamlfa2ttl/check_metadata.py
+++ b/workflows/yamlfa2ttl/check_metadata.py
@@ -6,12 +6,14 @@ import schema_salad.jsonld_context
 from pyshex.evaluate import evaluate
 
 path_yaml = sys.argv[1]
-path_schema_yaml = sys.argv[2]
-path_shex_rdf = sys.argv[3]
+path_fasta = sys.argv[2]
+path_schema_yaml = sys.argv[3]
+path_shex_rdf = sys.argv[4]
 
 with open(path_schema_yaml, "rb") as f:
     cache = {
-        "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode("utf-8")
+        "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode(
+            "utf-8")
     }
 
 metadata_schema = schema_salad.schema.load_schema(
@@ -29,7 +31,10 @@ with open(path_shex_rdf, "rb") as f:
 
 doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, path_yaml, True)
 g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
-rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape")
+rslt, reason = evaluate(
+    g, shex, doc["id"],
+    "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape"
+)
 
 # As part of QC make sure serialization works too, this will raise
 # an exception if there are invalid URIs.
@@ -37,3 +42,18 @@ g.serialize(format="ntriples")
 
 if not rslt:
     raise Exception(reason)
+
+# The sample_id in the FASTA header has to be equal to the sample_id in the YAML file
+sample_id_from_metadata = metadata['sample']['sample_id']
+
+sample_id_from_fasta = ''
+
+with open(path_fasta) as f:
+    for line in f:
+        sample_id_from_fasta = line.strip().split(' ')[0][1:]
+        break
+
+if sample_id_from_metadata != sample_id_from_fasta:
+    raise ValueError(
+        f"sample_id in the YAML file '{sample_id_from_metadata}' is different from the sample_id in the FASTA '{sample_id_from_fasta}'"
+    )
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py
index f92bf6d..58a65b9 100644
--- a/workflows/yamlfa2ttl/check_sequence.py
+++ b/workflows/yamlfa2ttl/check_sequence.py
@@ -26,7 +26,7 @@ def read_single_fasta(path_fasta):
     return header, ''.join(sequence)
 
 
-print("FASTA QC: checking similarity to the reference")
+print("FASTA QC: checking similarity to the reference", file=sys.stderr)
 
 header, sequence = read_single_fasta(path_fasta)
 
@@ -46,7 +46,7 @@ with tempfile.NamedTemporaryFile() as tmp_fasta:
             "minimap2", "-c", "-x", "asm20",
             tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
         ]
-        print(" ".join(cmd))
+        print(" ".join(cmd), file=sys.stderr)
 
         result = subprocess.run(cmd, stdout=subprocess.PIPE)
         result.check_returncode()
diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
index 2913e99..563f00f 100644
--- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl
+++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
@@ -1,10 +1,3 @@
-~/.config/guix/current/bin/guix environment -C guix --ad-hoc cwltool python python-biopython python-requests python-dateutil python-magic ruby
-cwltool --preserve-environment PYTHONPATH yamlfa2ttl.cwl --path_fasta ~/bh20-seq-resource/example/sequence.fasta
-
-cwltool --no-container --preserve-environment GUIX_ENVIRONMENT --preserve-environment PYTHONPATH yamlfa2ttl.cwl --path_fasta ~/bh20-seq-resource/example/sequence.fasta
-
-
-
 #!/usr/bin/env cwl-runner
 
 cwlVersion: v1.1
@@ -12,7 +5,7 @@ class: Workflow
 doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)"
 
 inputs:
-  path_fasta:
+  path-fasta:
     type: string
     doc: input FASTA to validate
 
@@ -20,14 +13,14 @@ inputs:
     type: string
     default: text/fasta
 
-  path_yaml:
+  path-yaml:
     type: string
     doc: input YAML to validate and convert in TTL
 
 steps:
   check_format:
     in:
-      path_fasta: path_fasta
+      path_fasta: path-fasta
       format_to_check: format_to_check
     doc: the input has to be a valid FASTA format file
     out: []
@@ -35,15 +28,16 @@ steps:
 
   check_sequence:
     in:
-      path_fasta: path_fasta
+      path_fasta: path-fasta
     doc: the input sequence has to be enough similar to the reference
     out: []
     run: check_sequence.cwl
 
   check_metadata:
     in:
-      path_yaml: path_yaml
-    doc: the input metadata information to put in the knowledge graph
+      path_yaml: path-yaml
+      path_fasta: path-fasta
+    doc: check the input metadata information to put in the knowledge graph
     out: []
     run: check_metadata.cwl
 
-- 
cgit v1.2.3