aboutsummaryrefslogtreecommitdiff
path: root/bh20sequploader/main.py
blob: cdc4c3f7fa57065dcd52585f3527b86cb1b3f38a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import time
import arvados
import arvados.collection
import json
import logging
import magic
from pathlib import Path
import urllib.request
import socket
import getpass
import sys
sys.path.insert(0,'.')
from bh20sequploader.qc_metadata import qc_metadata
from bh20sequploader.qc_fasta import qc_fasta

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__ )
log.debug("Entering sequence uploader")

ARVADOS_API_HOST='lugli.arvadosapi.com'
ARVADOS_API_TOKEN='2fbebpmbo3rw3x05ueu2i6nx70zhrsb1p22ycu3ry34m4x4462'
UPLOAD_PROJECT='lugli-j7d0g-n5clictpuvwk8aa'

def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
    try:
        log.debug("Checking metadata" if do_qc else "Skipping metadata check")
        if do_qc and not qc_metadata(metadata.name):
            log.warning("Failed metadata qc")
            exit(1)
    except ValueError as e:
        log.debug(e)
        log.debug("Failed metadata qc")
        print(e)
        exit(1)

    target = []
    try:
        log.debug("FASTA/FASTQ QC" if do_qc else "Limited FASTA/FASTQ QC")
        target.append(qc_fasta(sequence_p1, check_with_clustalw=do_qc))
        if sequence_p2:
            target.append(qc_fasta(sequence_p2))
            target[0] = ("reads_1."+target[0][0][6:], target[0][1])
            target[1] = ("reads_2."+target[1][0][6:], target[0][1])
    except ValueError as e:
        log.debug(e)
        log.debug("Failed FASTA qc")
        print(e)
        exit(1)

    return target

def upload_sequence(col, target, sequence):
    with col.open(target[0], "wb") as f:
        r = sequence.read(65536)
        while r:
            f.write(r)
            r = sequence.read(65536)


def main():
    parser = argparse.ArgumentParser(description='Upload SARS-CoV-19 sequences for analysis')
    parser.add_argument('metadata', type=argparse.FileType('r'), help='sequence metadata json')
    parser.add_argument('sequence_p1', type=argparse.FileType('rb'), help='sequence FASTA/FASTQ')
    parser.add_argument('sequence_p2', type=argparse.FileType('rb'), default=None, nargs='?', help='sequence FASTQ pair')
    parser.add_argument("--validate", action="store_true", help="Dry run, validate only")
    parser.add_argument("--skip-qc", action="store_true", help="Skip local qc check")
    args = parser.parse_args()

    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN, insecure=True)

    target = qc_stuff(args.metadata, args.sequence_p1, args.sequence_p2, not args.skip_qc)
    seqlabel = target[0][1]

    if args.validate:
        print("Valid")
        exit(0)

    col = arvados.collection.Collection(api_client=api)

    upload_sequence(col, target[0], args.sequence_p1)
    if args.sequence_p2:
        upload_sequence(col, target[1], args.sequence_p2)

    print("Reading metadata")
    with col.open("metadata.yaml", "w") as f:
        r = args.metadata.read(65536)
        print(r[0:20])
        while r:
            f.write(r)
            r = args.metadata.read(65536)

    external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')

    try:
        username = getpass.getuser()
    except KeyError:
        username = "unknown"

    properties = {
        "sequence_label": seqlabel,
        "upload_app": "bh20-seq-uploader",
        "upload_ip": external_ip,
        "upload_user": "%s@%s" % (username, socket.gethostname())
    }

    col.save_new(owner_uuid=UPLOAD_PROJECT, name="%s uploaded by %s from %s" %
                 (seqlabel, properties['upload_user'], properties['upload_ip']),
                 properties=properties, ensure_unique_name=True)

    print("Saved to %s" % col.manifest_locator())

    print("Done")

if __name__ == "__main__":
    main()