import re import schema_salad.schema import schema_salad.jsonld_context import json metadataSchema = '$(inputs.metadataSchema.path)' metadata = $(inputs.metadata) subjects = $(inputs.subjects) dups = json.loads('''$(inputs.dups)''') originalLabels = $(inputs.originalLabels) (document_loader, avsc_names, schema_metadata, metaschema_loader) = schema_salad.schema.load_schema(metadataSchema) for i, m in enumerate(metadata): doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], False, False) doc["id"] = subjects[i] g = schema_salad.jsonld_context.makerdf(subjects[i], doc, document_loader.ctx) print(g.serialize(format="ntriples").decode("utf-8")) import logging if dups: sameseqs = open(dups["path"], "rt") for d in sameseqs: logging.warn(d) g = re.match(r"\\d+\\t(.*)", d) logging.warn("%s", g.group(1)) sp = g.group(1).split(",") for n in sp[1:]: print("<%s> <%s> ." % (n.strip(), sp[0].strip())) orig = open(originalLabels["path"], "rt") print(orig.read())