aboutsummaryrefslogtreecommitdiff
path: root/workflows/yamlfa2ttl/check_metadata.py
diff options
context:
space:
mode:
author: AndreaGuarracino, 2021-01-08 23:44:47 +0100
committer: AndreaGuarracino, 2021-01-08 23:44:47 +0100
commit: 53a47a18a45270c3f037fabe9cc973c66bfe50bf (patch)
tree: 1fb71fae0d629127a5da6f9ad879fff259016b14 /workflows/yamlfa2ttl/check_metadata.py
parent: 2ddf72a4028475eb65dfeab153f3565e698d5338 (diff)
download: bh20-seq-resource-yamlfa2ttl.tar.gz
bh20-seq-resource-yamlfa2ttl.tar.lz
bh20-seq-resource-yamlfa2ttl.zip
sample_id in the FASTA has to match the sample_id in the YAML [yamlfa2ttl]
Diffstat (limited to 'workflows/yamlfa2ttl/check_metadata.py')
-rw-r--r-- workflows/yamlfa2ttl/check_metadata.py 28
1 files changed, 24 insertions, 4 deletions
diff --git a/workflows/yamlfa2ttl/check_metadata.py b/workflows/yamlfa2ttl/check_metadata.py
index 05494ca..6dd0d5b 100644
--- a/workflows/yamlfa2ttl/check_metadata.py
+++ b/workflows/yamlfa2ttl/check_metadata.py
@@ -6,12 +6,14 @@ import schema_salad.jsonld_context
from pyshex.evaluate import evaluate
path_yaml = sys.argv[1]
-path_schema_yaml = sys.argv[2]
-path_shex_rdf = sys.argv[3]
+path_fasta = sys.argv[2]
+path_schema_yaml = sys.argv[3]
+path_shex_rdf = sys.argv[4]
with open(path_schema_yaml, "rb") as f:
cache = {
- "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode("utf-8")
+ "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": f.read().decode(
+ "utf-8")
}
metadata_schema = schema_salad.schema.load_schema(
@@ -29,7 +31,10 @@ with open(path_shex_rdf, "rb") as f:
doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, path_yaml, True)
g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
-rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape")
+rslt, reason = evaluate(
+ g, shex, doc["id"],
+ "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape"
+)
# As part of QC make sure serialization works too, this will raise
# an exception if there are invalid URIs.
@@ -37,3 +42,18 @@ g.serialize(format="ntriples")
if not rslt:
raise Exception(reason)
+
+# The sample_id in the FASTA header has to be equal to the sample_id in the YAML file
+sample_id_from_metadata = metadata['sample']['sample_id']
+
+sample_id_from_fasta = ''
+
+with open(path_fasta) as f:
+ for line in f:
+ sample_id_from_fasta = line.strip().split(' ')[0][1:]
+ break
+
+if sample_id_from_metadata != sample_id_from_fasta:
+ raise ValueError(
+ f"sample_id in the YAML file '{sample_id_from_metadata}' is different from the sample_id in the FASTA '{sample_id_from_fasta}'"
+ )