From 329a1a7e122eda41016185d1b1e8d50d97f8857b Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Thu, 7 Jan 2021 03:17:47 -0600 Subject: Allow for xml and xml.gz files --- workflows/pull-data/genbank/genbank.py | 3 +- .../genbank/transform-genbank-xml2yamlfa.py | 78 ++++++++++++---------- 2 files changed, 43 insertions(+), 38 deletions(-) diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 85d615c..026c03f 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -111,7 +111,8 @@ def get_metadata(id, gbseq): # print(n,file=sys.stderr) if n != 'Unpublished': institute,address = n.split(',',1) - submitter.submitter_name = institute.split(') ')[1] + if ")" in institute: + submitter.submitter_name = institute.split(')')[1] submitter.submitter_address = address.strip() except AttributeError: pass diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py index 9414864..1a8035d 100755 --- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py +++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py @@ -33,43 +33,47 @@ states = {} for xmlfn in args.files: print(f"--- Reading {xmlfn}") - with gzip.open(xmlfn, 'r') as f: - xml = f.read().decode() - tree = ET.fromstring(xml) - for gb in tree.findall('./GBSeq'): - valid = None - error = None - meta = {} - id = gb.find("GBSeq_locus").text - basename = dir+"/"+id - print(f" parsing {id}") - try: - valid,meta = genbank.get_metadata(id,gb) - if valid: - # --- write JSON - jsonfn = basename + ".json" - with open(jsonfn, 'w') as outfile: - print(f" writing {jsonfn}") - json.dump(meta, outfile, indent=4) - # --- write FASTA - fa = basename+".fa" - seq = genbank.get_sequence(id,gb) - print(f" writing {fa}") - with open(fa,"w") as f2: - f2.write(f"> {id}\n") - f2.write(seq) - # print(seq) - except genbank.GBError as e: - error = f"{e} for {id}" - print(error,file=sys.stderr) - valid = False - state = {} - state['valid'] = valid - if error: - state['error'] = error - if meta['warnings']: - state['warnings'] = meta['warnings'] - states[id] = state + try: + with gzip.open(xmlfn, 'r') as f: + xml = f.read().decode() + except Exception: + with open(xmlfn, 'r') as f: + xml = f.read() + tree = ET.fromstring(xml) + for gb in tree.findall('./GBSeq'): + valid = None + error = None + meta = {} + id = gb.find("GBSeq_locus").text + basename = dir+"/"+id + print(f" parsing {id}") + try: + valid,meta = genbank.get_metadata(id,gb) + if valid: + # --- write JSON + jsonfn = basename + ".json" + with open(jsonfn, 'w') as outfile: + print(f" writing {jsonfn}") + json.dump(meta, outfile, indent=4) + # --- write FASTA + fa = basename+".fa" + seq = genbank.get_sequence(id,gb) + print(f" writing {fa}") + with open(fa,"w") as f2: + f2.write(f"> {id}\n") + f2.write(seq) + # print(seq) + except genbank.GBError as e: + error = f"{e} for {id}" + print(error,file=sys.stderr) + valid = False + state = {} + state['valid'] = valid + if error: + state['error'] = error + if meta['warnings']: + state['warnings'] = meta['warnings'] + states[id] = state statefn = dir + '/state.json' with open(statefn, 'w') as outfile: -- cgit v1.2.3