blob: ebdf17ef8ec0a2b8427e059666365bdf03b19e7d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
#!/usr/bin/env python3
#
# Create a single YAML/FASTA from genbank XML
#
# transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
#
# Where --in can be a file or a directory
# ----------------------------------------------------------------------
# See also directory .guix-run and README.md
import argparse
import gzip
import os
import sys
import types
import xml.etree.ElementTree as ET
from utils import chunks
import genbank
parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='Directory to write to',
required=True)
parser.add_argument('files', nargs='+', help='file(s)')
args = parser.parse_args()
dir = args.out
if not os.path.exists(dir):
raise Exception(f"Directory {dir} does not exist")
states = {}
for xmlfn in args.files:
print(f"--- Reading {xmlfn}")
with gzip.open(xmlfn, 'r') as f:
xml = f.read().decode()
tree = ET.fromstring(xml)
for gb in tree.findall('./GBSeq'):
valid = None
error = None
meta = {}
id = gb.find("GBSeq_locus").text
basename = dir+"/"+id
print(f" parsing {id}")
try:
valid,meta = genbank.get_metadata(id,gb)
if valid:
fa = basename+".fa"
seq = genbank.get_sequence(id,gb)
print(f" writing {fa}")
with open(fa,"w") as f2:
f2.write(f"> {id}\n")
f2.write(seq)
# print(seq)
except genbank.GBError as e:
error = f"{e} for {id}"
print(error,file=sys.stderr)
valid = False
state = {}
state['valid'] = valid
if error:
state['error'] = error
if meta['warnings']:
state['warnings'] = meta['warnings']
states[id] = state
print(states)
|