about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author AndreaGuarracino 2021-01-08 23:44:47 +0100
committer AndreaGuarracino 2021-01-08 23:44:47 +0100
commit 53a47a18a45270c3f037fabe9cc973c66bfe50bf (patch)
tree 1fb71fae0d629127a5da6f9ad879fff259016b14
parent 2ddf72a4028475eb65dfeab153f3565e698d5338 (diff)
download bh20-seq-resource-yamlfa2ttl.tar.gz
bh20-seq-resource-yamlfa2ttl.tar.lz
bh20-seq-resource-yamlfa2ttl.zip
sample_id in the FASTA has to match the sample_id in the YAML [yamlfa2ttl]
-rw-r--r-- workflows/yamlfa2ttl/check_metadata.cwl 7
-rw-r--r-- workflows/yamlfa2ttl/check_metadata.py 28
-rw-r--r-- workflows/yamlfa2ttl/check_sequence.py 4
-rw-r--r-- workflows/yamlfa2ttl/yamlfa2ttl.cwl 20
4 files changed, 38 insertions, 21 deletions
diff --git a/workflows/yamlfa2ttl/check_metadata.cwl b/workflows/yamlfa2ttl/check_metadata.cwl
index 72c4d36..593155b 100644
--- a/workflows/yamlfa2ttl/check_metadata.cwl
+++ b/workflows/yamlfa2ttl/check_metadata.cwl
@@ -12,13 +12,16 @@ inputs:
path_yaml:
type: string
inputBinding: {position: 2}
+ path_fasta:
+ type: string
+ inputBinding: {position: 3}
path_schema_yaml:
type: File
- inputBinding: {position: 3}
+ inputBinding: {position: 4}
default: {class: File, location: ../../bh20sequploader/bh20seq-schema.yml}
path_shex_rdf:
type: File
- inputBinding: {position: 4}
+ inputBinding: {position: 5}
default: {class: File, location: ../../bh20sequploader/bh20seq-shex.rdf}
outputs: []
diff --git a/workflows/yamlfa2ttl/check_metadata.py b/workflows/yamlfa2ttl/check_metadata.py
index 05494ca..6dd0d5b 100644
--- a/workflows/yamlfa2ttl/check_metadata.py
+++ b/workflows/yamlfa2ttl/check_metadata.py
@@ -6,12 +6,14 @@ import schema_salad.jsonld_context
from pyshex.evaluate import evaluate
path_yaml = sys.argv[1]
-path_schema_yaml = sys.argv[2]
-path_shex_rdf = sys.argv[3]
+path_fasta = sys.argv[2]
+path_schema_yaml = sys.argv[3]
+path_shex_rdf = sys.argv[4]
with open(path_schema_yaml, "rb") as f:
cache = {
- "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode("utf-8")
+ "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode(
+ "utf-8")
}
metadata_schema = schema_salad.schema.load_schema(
@@ -29,7 +31,10 @@ with open(path_shex_rdf, "rb") as f:
doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, path_yaml, True)
g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
-rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape")
+rslt, reason = evaluate(
+ g, shex, doc["id"],
+ "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape"
+)
# As part of QC make sure serialization works too, this will raise
# an exception if there are invalid URIs.
@@ -37,3 +42,18 @@ g.serialize(format="ntriples")
if not rslt:
raise Exception(reason)
+
+# The sample_id in the FASTA header has to be equal to the sample_id in the YAML file
+sample_id_from_metadata = metadata['sample']['sample_id']
+
+sample_id_from_fasta = ''
+
+with open(path_fasta) as f:
+ for line in f:
+ sample_id_from_fasta = line.strip().split(' ')[0][1:]
+ break
+
+if sample_id_from_metadata != sample_id_from_fasta:
+ raise ValueError(
+ f"sample_id in the YAML file '{sample_id_from_metadata}' is different from the sample_id in the FASTA '{sample_id_from_fasta}'"
+ )
diff --git a/workflows/yamlfa2ttl/check_sequence.py b/workflows/yamlfa2ttl/check_sequence.py
index f92bf6d..58a65b9 100644
--- a/workflows/yamlfa2ttl/check_sequence.py
+++ b/workflows/yamlfa2ttl/check_sequence.py
@@ -26,7 +26,7 @@ def read_single_fasta(path_fasta):
return header, ''.join(sequence)
-print("FASTA QC: checking similarity to the reference")
+print("FASTA QC: checking similarity to the reference", file=sys.stderr)
header, sequence = read_single_fasta(path_fasta)
@@ -46,7 +46,7 @@ with tempfile.NamedTemporaryFile() as tmp_fasta:
"minimap2", "-c", "-x", "asm20",
tmp_sars_cov_2_reference_fasta.name, tmp_fasta.name
]
- print(" ".join(cmd))
+ print(" ".join(cmd), file=sys.stderr)
result = subprocess.run(cmd, stdout=subprocess.PIPE)
result.check_returncode()
diff --git a/workflows/yamlfa2ttl/yamlfa2ttl.cwl b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
index 2913e99..563f00f 100644
--- a/workflows/yamlfa2ttl/yamlfa2ttl.cwl
+++ b/workflows/yamlfa2ttl/yamlfa2ttl.cwl
@@ -1,10 +1,3 @@
-~/.config/guix/current/bin/guix environment -C guix --ad-hoc cwltool python python-biopython python-requests python-dateutil python-magic ruby
-cwltool --preserve-environment PYTHONPATH yamlfa2ttl.cwl --path_fasta ~/bh20-seq-resource/example/sequence.fasta
-
-cwltool --no-container --preserve-environment GUIX_ENVIRONMENT --preserve-environment PYTHONPATH yamlfa2ttl.cwl --path_fasta ~/bh20-seq-resource/example/sequence.fasta
-
-
-
#!/usr/bin/env cwl-runner
cwlVersion: v1.1
@@ -12,7 +5,7 @@ class: Workflow
doc: "Workflow to go from YAML (metadata) + FASTA (sequence) to TTL (metadata)"
inputs:
- path_fasta:
+ path-fasta:
type: string
doc: input FASTA to validate
@@ -20,14 +13,14 @@ inputs:
type: string
default: text/fasta
- path_yaml:
+ path-yaml:
type: string
doc: input YAML to validate and convert in TTL
steps:
check_format:
in:
- path_fasta: path_fasta
+ path_fasta: path-fasta
format_to_check: format_to_check
doc: the input has to be a valid FASTA format file
out: []
@@ -35,15 +28,16 @@ steps:
check_sequence:
in:
- path_fasta: path_fasta
+ path_fasta: path-fasta
doc: the input sequence has to be enough similar to the reference
out: []
run: check_sequence.cwl
check_metadata:
in:
- path_yaml: path_yaml
- doc: the input metadata information to put in the knowledge graph
+ path_yaml: path-yaml
+ path_fasta: path-fasta
+ doc: check the input metadata information to put in the knowledge graph
out: []
run: check_metadata.cwl