aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
blob: 9414864662be490eabd908dcf51c4ba5abf73d9c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
#
# Create a single YAML/FASTA for each genbank entry in GenBank XML file
#
#   transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
# ----------------------------------------------------------------------

# See also directory .guix-run and README.md

import argparse
import gzip
import json
import os
import sys
import types
import xml.etree.ElementTree as ET
from utils import chunks
import genbank

parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='Directory to write to',
required=True)
parser.add_argument('files', nargs='+', help='file(s)')
args = parser.parse_args()

dir = args.out
if not os.path.exists(dir):
    raise Exception(f"Directory {dir} does not exist")

states = {}

for xmlfn in args.files:
    print(f"--- Reading {xmlfn}")
    with gzip.open(xmlfn, 'r') as f:
        xml = f.read().decode()
        tree = ET.fromstring(xml)
        for gb in tree.findall('./GBSeq'):
            valid = None
            error = None
            meta = {}
            id = gb.find("GBSeq_locus").text
            basename = dir+"/"+id
            print(f"    parsing {id}")
            try:
                valid,meta = genbank.get_metadata(id,gb)
                if valid:
                    # --- write JSON
                    jsonfn = basename + ".json"
                    with open(jsonfn, 'w') as outfile:
                        print(f"    writing {jsonfn}")
                        json.dump(meta, outfile, indent=4)
                    # --- write FASTA
                    fa = basename+".fa"
                    seq = genbank.get_sequence(id,gb)
                    print(f"    writing {fa}")
                    with open(fa,"w") as f2:
                        f2.write(f"> {id}\n")
                        f2.write(seq)
                    # print(seq)
            except genbank.GBError as e:
                error = f"{e} for {id}"
                print(error,file=sys.stderr)
                valid = False
            state = {}
            state['valid'] = valid
            if error:
                state['error'] = error
            if meta['warnings']:
                state['warnings'] = meta['warnings']
            states[id] = state

statefn = dir + '/state.json'
with open(statefn, 'w') as outfile:
    print(f"    Writing {statefn}")
    json.dump(states, outfile, indent=4)