aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-07 03:17:47 -0600
committer Pjotr Prins 2021-01-07 03:17:59 -0600
commit 329a1a7e122eda41016185d1b1e8d50d97f8857b (patch)
tree 8dbc60b37311beea40730c6f70d4c738a1e919dd
parent 27a2b926036211469eccbf8c3d9580182482bdc2 (diff)
download bh20-seq-resource-329a1a7e122eda41016185d1b1e8d50d97f8857b.tar.gz
bh20-seq-resource-329a1a7e122eda41016185d1b1e8d50d97f8857b.tar.lz
bh20-seq-resource-329a1a7e122eda41016185d1b1e8d50d97f8857b.zip
Allow for xml and xml.gz files
-rw-r--r-- workflows/pull-data/genbank/genbank.py 3
-rwxr-xr-x workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py 78
2 files changed, 43 insertions, 38 deletions
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
# print(n,file=sys.stderr)
if n != 'Unpublished':
institute,address = n.split(',',1)
- submitter.submitter_name = institute.split(') ')[1]
+ if ")" in institute:
+ submitter.submitter_name = institute.split(')')[1]
submitter.submitter_address = address.strip()
except AttributeError:
pass
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
for xmlfn in args.files:
print(f"--- Reading {xmlfn}")
- with gzip.open(xmlfn, 'r') as f:
- xml = f.read().decode()
- tree = ET.fromstring(xml)
- for gb in tree.findall('./GBSeq'):
- valid = None
- error = None
- meta = {}
- id = gb.find("GBSeq_locus").text
- basename = dir+"/"+id
- print(f" parsing {id}")
- try:
- valid,meta = genbank.get_metadata(id,gb)
- if valid:
- # --- write JSON
- jsonfn = basename + ".json"
- with open(jsonfn, 'w') as outfile:
- print(f" writing {jsonfn}")
- json.dump(meta, outfile, indent=4)
- # --- write FASTA
- fa = basename+".fa"
- seq = genbank.get_sequence(id,gb)
- print(f" writing {fa}")
- with open(fa,"w") as f2:
- f2.write(f"> {id}\n")
- f2.write(seq)
- # print(seq)
- except genbank.GBError as e:
- error = f"{e} for {id}"
- print(error,file=sys.stderr)
- valid = False
- state = {}
- state['valid'] = valid
- if error:
- state['error'] = error
- if meta['warnings']:
- state['warnings'] = meta['warnings']
- states[id] = state
+ try:
+ with gzip.open(xmlfn, 'r') as f:
+ xml = f.read().decode()
+ except Exception:
+ with open(xmlfn, 'r') as f:
+ xml = f.read()
+ tree = ET.fromstring(xml)
+ for gb in tree.findall('./GBSeq'):
+ valid = None
+ error = None
+ meta = {}
+ id = gb.find("GBSeq_locus").text
+ basename = dir+"/"+id
+ print(f" parsing {id}")
+ try:
+ valid,meta = genbank.get_metadata(id,gb)
+ if valid:
+ # --- write JSON
+ jsonfn = basename + ".json"
+ with open(jsonfn, 'w') as outfile:
+ print(f" writing {jsonfn}")
+ json.dump(meta, outfile, indent=4)
+ # --- write FASTA
+ fa = basename+".fa"
+ seq = genbank.get_sequence(id,gb)
+ print(f" writing {fa}")
+ with open(fa,"w") as f2:
+ f2.write(f"> {id}\n")
+ f2.write(seq)
+ # print(seq)
+ except genbank.GBError as e:
+ error = f"{e} for {id}"
+ print(error,file=sys.stderr)
+ valid = False
+ state = {}
+ state['valid'] = valid
+ if error:
+ state['error'] = error
+ if meta['warnings']:
+ state['warnings'] = meta['warnings']
+ states[id] = state
statefn = dir + '/state.json'
with open(statefn, 'w') as outfile: