aboutsummaryrefslogtreecommitdiff
path: root/workflows/yamlfa2ttl/check_metadata.py
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/yamlfa2ttl/check_metadata.py')
-rw-r--r--workflows/yamlfa2ttl/check_metadata.py28
1 files changed, 24 insertions, 4 deletions
diff --git a/workflows/yamlfa2ttl/check_metadata.py b/workflows/yamlfa2ttl/check_metadata.py
index 05494ca..6dd0d5b 100644
--- a/workflows/yamlfa2ttl/check_metadata.py
+++ b/workflows/yamlfa2ttl/check_metadata.py
@@ -6,12 +6,14 @@ import schema_salad.jsonld_context
from pyshex.evaluate import evaluate
path_yaml = sys.argv[1]
-path_schema_yaml = sys.argv[2]
-path_shex_rdf = sys.argv[3]
+path_fasta = sys.argv[2]
+path_schema_yaml = sys.argv[3]
+path_shex_rdf = sys.argv[4]
with open(path_schema_yaml, "rb") as f:
cache = {
- "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode("utf-8")
+ "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode(
+ "utf-8")
}
metadata_schema = schema_salad.schema.load_schema(
@@ -29,7 +31,10 @@ with open(path_shex_rdf, "rb") as f:
doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, path_yaml, True)
g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
-rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape")
+rslt, reason = evaluate(
+ g, shex, doc["id"],
+ "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape"
+)
# As part of QC make sure serialization works too, this will raise
# an exception if there are invalid URIs.
@@ -37,3 +42,18 @@ g.serialize(format="ntriples")
if not rslt:
raise Exception(reason)
+
+# The sample_id in the FASTA header has to be equal to the sample_id in the YAML file
+sample_id_from_metadata = metadata['sample']['sample_id']
+
+sample_id_from_fasta = ''
+
+with open(path_fasta) as f:
+ for line in f:
+ sample_id_from_fasta = line.strip().split(' ')[0][1:]
+ break
+
+if sample_id_from_metadata != sample_id_from_fasta:
+ raise ValueError(
+ f"sample_id in the YAML file '{sample_id_from_metadata}' is different from the sample_id in the FASTA '{sample_id_from_fasta}'"
+ )