aboutsummaryrefslogtreecommitdiff
path: root/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
blob: 3b9813c247bcb8df1fa4cf310ae6ab9a77341d82 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
#
# Create a single YAML/FASTA for each genbank entry in GenBank XML file
#
#   transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
# ----------------------------------------------------------------------

# See also directory .guix-run and README.md

import argparse
import gzip
import json
import os
import sys
import types
import xml.etree.ElementTree as ET
from utils import chunks
import genbank

parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='Directory to write to',
required=True)
parser.add_argument('files', nargs='+', help='file(s)')
args = parser.parse_args()

dir = args.out
if not os.path.exists(dir):
    raise Exception(f"Directory {dir} does not exist")

states = {}

for xmlfn in args.files:
    print(f"--- Reading {xmlfn}")
    try:
        with gzip.open(xmlfn, 'r') as f:
            xml = f.read().decode()
    except Exception:
        with open(xmlfn, 'r') as f:
            xml = f.read()
    tree = ET.fromstring(xml)
    for gb in tree.findall('./GBSeq'):
        valid = None
        error = None
        meta = {}
        id = gb.find("GBSeq_accession-version").text
        basename = dir+"/"+id
        print(f"    parsing {id}")
        try:
            valid,meta = genbank.get_metadata(id,gb)
            if valid:
                # --- write JSON
                jsonfn = basename + ".json"
                with open(jsonfn, 'w') as outfile:
                    print(f"    writing {jsonfn}")
                    json.dump(meta, outfile, indent=2)
                # --- write FASTA
                fa = basename+".fa"
                seq = genbank.get_sequence(id,gb)
                print(f"    writing {fa}")
                with open(fa,"w") as f2:
                    f2.write(f">{id}\n")
                    f2.write(seq)
                # print(seq)
        except genbank.GBError as e:
            error = f"{e} for {id}"
            print(error,file=sys.stderr)
            valid = False
        state = {}
        state['valid'] = valid
        if error:
            state['error'] = error
        if meta['warnings']:
            state['warnings'] = meta['warnings']
        states[id] = state

statefn = dir + '/state.json'
with open(statefn, 'w') as outfile:
    print(f"    Writing {statefn}")
    json.dump(states, outfile, indent=4)