aboutsummaryrefslogtreecommitdiff
path: root/scripts/cleanup.py
blob: 691930572742b696fce8ffbf39cdcaf7fa8b043b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import arvados
import arvados.util
import arvados.keep
import ruamel.yaml

api = arvados.api()
keepclient = arvados.keep.KeepClient(api_client=api)

UPLOADER_PROJECT = 'lugli-j7d0g-n5clictpuvwk8aa'
VALIDATED_PROJECT = 'lugli-j7d0g-5ct8p1i1wrgyjvp'

delete_patterns = [
    "%missing%`collection_location`%",
    "%missing%`technology`%",
    "%missing%`host_species`%",
    "%QC fail: alignment%",
    "%does not look like a valid URI%",
    "%Duplicate of%",
    "%No matching triples found for predicate obo:NCIT_C42781%",
    "%does not look like a valid URI%"
    ]

revalidate_patterns = [
    "%missing%`license`%",
    "%QC fail%"
]

for p in delete_patterns:
    c = arvados.util.list_all(api.collections().list, filters=[
        ["owner_uuid", "=", UPLOADER_PROJECT],
        ["properties.errors", "like", p]])
    for i in c:
        print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
        api.collections().delete(uuid=i["uuid"]).execute()

for p in revalidate_patterns:
    c = arvados.util.list_all(api.collections().list, filters=[
        ["owner_uuid", "=", UPLOADER_PROJECT],
        ["properties.errors", "like", p]])
    for i in c:
        print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
        pr = i["properties"]
        if "status" in pr:
            del pr["status"]
        if "errors" in pr:
            del pr["errors"]
        api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute()

c = arvados.util.list_all(api.collections().list, filters=[
    ["owner_uuid", "=", VALIDATED_PROJECT],
    ["properties.sequence_label", "exists", False]])
for i in c:
    col = arvados.collection.Collection(i["uuid"], api_client=api, keep_client=keepclient)
    with col.open("metadata.yaml") as md:
        metadata_content = ruamel.yaml.round_trip_load(md)
    colprop = col.get_properties()
    colprop["sequence_label"] = metadata_content["sample"]["sample_id"]

    print("fixing sequence label %s %s" % (i["uuid"], colprop.get("sequence_label")))
    api.collections().update(uuid=i["uuid"], body={"properties": colprop}).execute()