1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
#!/usr/bin/env python3
#
# Create a single YAML/FASTA for each genbank entry in GenBank XML file
#
# transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
# ----------------------------------------------------------------------
# See also directory .guix-run and README.md
import argparse
import gzip
import json
import os
import sys
import types
import xml.etree.ElementTree as ET
from utils import chunks
import genbank
parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='Directory to write to',
required=True)
parser.add_argument('files', nargs='+', help='file(s)')
args = parser.parse_args()
dir = args.out
if not os.path.exists(dir):
raise Exception(f"Directory {dir} does not exist")
states = {}
for xmlfn in args.files:
print(f"--- Reading {xmlfn}")
try:
with gzip.open(xmlfn, 'r') as f:
xml = f.read().decode()
except Exception:
with open(xmlfn, 'r') as f:
xml = f.read()
tree = ET.fromstring(xml)
for gb in tree.findall('./GBSeq'):
valid = None
error = None
meta = {}
id = gb.find("GBSeq_accession-version").text
basename = dir+"/"+id
print(f" parsing {id}")
try:
valid,meta = genbank.get_metadata(id,gb)
if valid:
# --- write JSON
jsonfn = basename + ".json"
with open(jsonfn, 'w') as outfile:
print(f" writing {jsonfn}")
json.dump(meta, outfile, indent=2)
# --- write FASTA
fa = basename+".fa"
seq = genbank.get_sequence(id,gb)
print(f" writing {fa}")
with open(fa,"w") as f2:
f2.write(f">{id}\n")
f2.write(seq)
# print(seq)
except genbank.GBError as e:
error = f"{e} for {id}"
print(error,file=sys.stderr)
valid = False
state = {}
state['valid'] = valid
if error:
state['error'] = error
if meta['warnings']:
state['warnings'] = meta['warnings']
states[id] = state
statefn = dir + '/state.json'
with open(statefn, 'w') as outfile:
print(f" Writing {statefn}")
json.dump(states, outfile, indent=4)
|