1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
import re
import schema_salad.schema
import schema_salad.jsonld_context
import json
# NOTE(review): the $(...) expressions below are CWL parameter references
# substituted before this script runs — as written this is a template, not
# directly runnable Python.
metadataSchema = '$(inputs.metadataSchema.path)'  # path to the Schema Salad schema file
metadata = $(inputs.metadata)  # presumably a list of File objects ({"path": ...}) — see m["path"] below
subjects = $(inputs.subjects)  # subject URIs, assumed parallel to `metadata` — indexed by the same i
dups = json.loads('''$(inputs.dups)''')  # duplicate-sequence report File, or falsy when absent
originalLabels = $(inputs.originalLabels)  # File whose contents are echoed verbatim at the end
(document_loader,
 avsc_names,
 schema_metadata,
 metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
# Validate each metadata document against the schema and print its RDF
# serialization as N-Triples on stdout.
for i, m in enumerate(metadata):
    # The second return value was previously unpacked into `metadata`,
    # shadowing the list being iterated (it only worked because the loop
    # iterator was created before the rebind); renamed to avoid the trap.
    doc, _doc_metadata = schema_salad.schema.load_and_validate(
        document_loader, avsc_names, m["path"], False, False)
    doc["id"] = subjects[i]
    g = schema_salad.jsonld_context.makerdf(subjects[i], doc, document_loader.ctx)
    # NOTE(review): .decode() implies an rdflib version whose serialize()
    # returns bytes — newer rdflib returns str; confirm the pinned version.
    print(g.serialize(format="ntriples").decode("utf-8"))
import logging

# Emit a has_duplicate_sequence triple linking every duplicate sequence id
# to the first (representative) id on its line of the dups report.
if dups:
    # Each report line is expected as "<count>\t<id0>,<id1>,...".
    # NOTE(review): the doubled backslashes are intentional — CWL expression
    # escaping halves them, so the interpreter sees r"\d+\t(.*)". Do not
    # "fix" them here.
    dup_line = re.compile(r"\\d+\\t(.*)")
    with open(dups["path"], "rt") as sameseqs:
        for line in sameseqs:
            logging.warning(line)  # logging.warn() is deprecated
            match = dup_line.match(line)
            if match is None:
                # Previously this crashed with AttributeError on a line
                # that did not fit the expected format; skip it instead.
                logging.warning("skipping malformed dups line: %r", line)
                continue
            logging.warning("%s", match.group(1))
            ids = match.group(1).split(",")
            representative = ids[0].strip()
            for dup_id in ids[1:]:
                print("<%s> <http://biohackathon.org/bh20-seq-schema/has_duplicate_sequence> <%s> ." % (dup_id.strip(), representative))
# Echo the original-label triples verbatim; a context manager ensures the
# handle is closed (the original leaked it).
with open(originalLabels["path"], "rt") as orig_labels:
    print(orig_labels.read())
|