blob: c4e3eba0b3bb02e7e7465e191cc1cccc03c24b3e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
#!/usr/bin/env python3
#
# Create a single YAML/FASTA from genbank XML
#
# transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
#
# Also writes a validation file in the outdir named state.json
#
# Where --in can be a file or a directory
# ----------------------------------------------------------------------
# See also directory .guix-run and README.md
import argparse
import gzip
import os
import sys
import types
import xml.etree.ElementTree as ET
from utils import chunks
import genbank
parser = argparse.ArgumentParser()
parser.add_argument('--out', type=str, help='Directory to write to',
required=True)
parser.add_argument('files', nargs='+', help='file(s)')
args = parser.parse_args()
dir = args.out
if not os.path.exists(dir):
raise Exception(f"Directory {dir} does not exist")
states = {}
for xmlfn in args.files:
print(f"--- Reading {xmlfn}")
with gzip.open(xmlfn, 'r') as f:
xml = f.read().decode()
tree = ET.fromstring(xml)
for gb in tree.findall('./GBSeq'):
valid = None
error = None
id = gb.find("GBSeq_locus").text
basename = dir+"/"+id
print(f" parsing {id}")
try:
valid,meta = genbank.get_metadata(id,gb)
if valid:
fa = basename+".fa"
seq = genbank.get_sequence(id,gb)
print(f" writing {fa}")
with open(fa,"w") as f2:
f2.write(f"> {id}\n")
f2.write(seq)
# print(seq)
except genbank.GBError as e:
print(f"OS error: {e}")
valid = False
error = str(e)
state = {}
if not valid:
state['valid'] = False
if error:
state['error'] = error
states[id] = state
print(states)
|