about | summary | refs | log | tree | commit | diff
path: root/workflows/pull-data/genbank
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/pull-data/genbank')
-rwxr-xr-x workflows/pull-data/genbank/genbank-fetch-ids.py 1
-rw-r--r-- workflows/pull-data/genbank/genbank.py 3
-rwxr-xr-x workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py 78
-rwxr-xr-x workflows/pull-data/genbank/update-from-genbank.py 25
-rw-r--r-- workflows/pull-data/genbank/utils.py 22
5 files changed, 71 insertions, 58 deletions
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index cb48cd8..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -8,7 +8,6 @@
import argparse
import sys
-from datetime import date
from Bio import Entrez
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
# print(n,file=sys.stderr)
if n != 'Unpublished':
institute,address = n.split(',',1)
- submitter.submitter_name = institute.split(') ')[1]
+ if ")" in institute:
+ submitter.submitter_name = institute.split(')')[1]
submitter.submitter_address = address.strip()
except AttributeError:
pass
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
for xmlfn in args.files:
print(f"--- Reading {xmlfn}")
- with gzip.open(xmlfn, 'r') as f:
- xml = f.read().decode()
- tree = ET.fromstring(xml)
- for gb in tree.findall('./GBSeq'):
- valid = None
- error = None
- meta = {}
- id = gb.find("GBSeq_locus").text
- basename = dir+"/"+id
- print(f" parsing {id}")
- try:
- valid,meta = genbank.get_metadata(id,gb)
- if valid:
- # --- write JSON
- jsonfn = basename + ".json"
- with open(jsonfn, 'w') as outfile:
- print(f" writing {jsonfn}")
- json.dump(meta, outfile, indent=4)
- # --- write FASTA
- fa = basename+".fa"
- seq = genbank.get_sequence(id,gb)
- print(f" writing {fa}")
- with open(fa,"w") as f2:
- f2.write(f"> {id}\n")
- f2.write(seq)
- # print(seq)
- except genbank.GBError as e:
- error = f"{e} for {id}"
- print(error,file=sys.stderr)
- valid = False
- state = {}
- state['valid'] = valid
- if error:
- state['error'] = error
- if meta['warnings']:
- state['warnings'] = meta['warnings']
- states[id] = state
+ try:
+ with gzip.open(xmlfn, 'r') as f:
+ xml = f.read().decode()
+ except Exception:
+ with open(xmlfn, 'r') as f:
+ xml = f.read()
+ tree = ET.fromstring(xml)
+ for gb in tree.findall('./GBSeq'):
+ valid = None
+ error = None
+ meta = {}
+ id = gb.find("GBSeq_locus").text
+ basename = dir+"/"+id
+ print(f" parsing {id}")
+ try:
+ valid,meta = genbank.get_metadata(id,gb)
+ if valid:
+ # --- write JSON
+ jsonfn = basename + ".json"
+ with open(jsonfn, 'w') as outfile:
+ print(f" writing {jsonfn}")
+ json.dump(meta, outfile, indent=4)
+ # --- write FASTA
+ fa = basename+".fa"
+ seq = genbank.get_sequence(id,gb)
+ print(f" writing {fa}")
+ with open(fa,"w") as f2:
+ f2.write(f"> {id}\n")
+ f2.write(seq)
+ # print(seq)
+ except genbank.GBError as e:
+ error = f"{e} for {id}"
+ print(error,file=sys.stderr)
+ valid = False
+ state = {}
+ state['valid'] = valid
+ if error:
+ state['error'] = error
+ if meta['warnings']:
+ state['warnings'] = meta['warnings']
+ states[id] = state
statefn = dir + '/state.json'
with open(statefn, 'w') as outfile:
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
from utils import chunks
from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
-BATCH=100
+Entrez.email = 'another_email@gmail.com' # FIXME
+
+BATCH = 100
parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
args = parser.parse_args()
ids = set()
with open(args.ids) as f:
- content = f.readlines()
- for line in content:
- ids.add(line.strip())
+ ids.update([line.strip() for line in f])
dir = args.out
if not os.path.exists(dir):
@@ -37,12 +36,14 @@ if not os.path.exists(dir):
request_num = BATCH
if args.max:
- request_num = min(BATCH,args.max)
+ request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+ xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+ print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
-for i, idsx in enumerate(chunks(list(ids), request_num)):
- xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
- print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
with gzip.open(xmlfn, 'w') as f:
- f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
- if args.max and i*request_num >= args.max:
+ f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+ if args.max and num_chunk * request_num >= args.max:
break
diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
index 3efc67a..96920a5 100644
--- a/workflows/pull-data/genbank/utils.py
+++ b/workflows/pull-data/genbank/utils.py
@@ -1,5 +1,6 @@
import os
+
def is_integer(string_to_check):
try:
int(string_to_check)
@@ -7,19 +8,26 @@ def is_integer(string_to_check):
except ValueError:
return False
+
def chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i + n]
+
def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
- # Check duplicated entry looking at all dictionaries
+ """
+ Check duplicated entry by looking in all dictionaries
+ """
+
field_to_term_to_uri_dict = {}
- path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
- os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+ path_dict_xxx_csv_list = [
+ os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+ os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
+ ]
for path_dict_xxx_csv in path_dict_xxx_csv_list:
- print('Read {}'.format(path_dict_xxx_csv))
+ print(f'Read {path_dict_xxx_csv}')
with open(path_dict_xxx_csv) as f:
for line in f:
@@ -31,7 +39,7 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
term = term.strip('"')
if term in field_to_term_to_uri_dict:
- print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+ print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
continue
field_to_term_to_uri_dict[term] = uri
@@ -54,9 +62,9 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
term = term.strip('"')
if term in field_to_term_to_uri_dict[field]:
- print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+ print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
continue
field_to_term_to_uri_dict[field][term] = uri
- return field_to_term_to_uri_dict
\ No newline at end of file
+ return field_to_term_to_uri_dict