aboutsummaryrefslogtreecommitdiff
path: root/workflows/pangenome-generate/collect-seqs.py
blob: 9a895491bd907f03f80b16be1ce90647a0126f7f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import sys
import arvados
import json
import shutil
import logging
import subprocess
import arvados.collection
import ruamel.yaml
import schema_salad.schema
import schema_salad.jsonld_context
from schema_salad.sourceline import add_lc_filename

api = arvados.api()
keepclient = arvados.keep.KeepClient(api_client=api)

validated = arvados.util.list_all(api.collections().list, filters=[
    ["owner_uuid", "=", sys.argv[1]],
    ["properties.status", "=", "validated"]])

validated.sort(key=lambda v: v["portable_data_hash"])

relabeled_fasta = open("relabeledSeqs.fasta", "wt")
merged_metadata = open("mergedMetadata.ttl", "wt")

metadataSchema = sys.argv[2]

blacklist = set()
if len(sys.argv) > 3:
    with open(sys.argv[3]) as bl:
        for l in bl:
            blacklist.add(l.strip())

(document_loader,
 avsc_names,
 schema_metadata,
 metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)


for item in validated:
    pdh = item["portable_data_hash"]
    uuid = item["uuid"]
    try:
        subject = "http://covid19.genenetwork.org/resource/%s" % uuid
        with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
            with col.open("metadata.yaml", "rt") as md:
                metadata_content = ruamel.yaml.round_trip_load(md)
                metadata_content["id"] = subject
                add_lc_filename(metadata_content, metadata_content["id"])
                doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
                g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)

            with col.open("sequence.fasta", "rt") as fa:
                label = fa.readline().strip()
                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_pdh> \"%s\" .\n" % (subject, pdh))
                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_version> \"%s\" .\n" % (subject, item["version"]))
                skip = (subject in blacklist or label[1:] in blacklist)
                if skip:
                    merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n" % subject)
                if not skip:
                    relabeled_fasta.write(">"+subject+"\n")
                data = fa.read(8096)
                while data:
                    if not skip:
                        relabeled_fasta.write(data)
                    endswithnewline = data.endswith("\n")
                    data = fa.read(8096)
                if not skip and not endswithnewline:
                    relabeled_fasta.write("\n")

            merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
    except Exception as e:
        logging.exception("Error processing collection %s" % uuid)

subprocess.run(["samtools", "faidx", "relabeledSeqs.fasta"])

shutil.rmtree(".cache")