diff options
Diffstat (limited to 'workflows/pull-data')
-rw-r--r-- | workflows/pull-data/genbank/README.md | 12 | ||||
-rwxr-xr-x | workflows/pull-data/genbank/genbank-fetch-ids.py | 39 | ||||
-rw-r--r-- | workflows/pull-data/genbank/genbank.py | 3 | ||||
-rwxr-xr-x | workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py | 78 | ||||
-rwxr-xr-x | workflows/pull-data/genbank/update-from-genbank.py | 25 | ||||
-rw-r--r-- | workflows/pull-data/genbank/utils.py | 22 |
6 files changed, 99 insertions, 80 deletions
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index 5464d1d..188ff6f 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -11,7 +11,8 @@ The following workflow sends GenBank data into PubSeq ```sh # --- get list of IDs already in PubSeq -../../tools/sparql-fetch-ids > pubseq_ids.txt +../../tools/pubseq-fetch-ids > pubseq_ids.txt + # --- get list of missing genbank IDs python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt @@ -26,6 +27,13 @@ python3 ../../workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --s ``` +## Validate GenBank data + +To pull the data from PubSeq use the list of pubseq ids generated +above. + + + # TODO -- [ ] Add id for GenBank accession - i.e. how can we tell a record is from GenBank +- [X] Add id for GenBank accession - i.e. how can we tell a record is from GenBank diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py index 1962daa..e9e7315 100755 --- a/workflows/pull-data/genbank/genbank-fetch-ids.py +++ b/workflows/pull-data/genbank/genbank-fetch-ids.py @@ -6,28 +6,19 @@ # # See also directory .guix-run and README.md -BATCH_SIZE=5000 - import argparse -import json -import os -import requests import sys -import xml.etree.ElementTree as ET -from datetime import date, datetime -from dateutil.parser import parse + +from Bio import Entrez parser = argparse.ArgumentParser() parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False) args = parser.parse_args() -from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME - -# min_acceptable_collection_date = datetime(2019, 12, 1) +BATCH_SIZE = 5000 -today_date = date.today().strftime("%Y.%m.%d") +Entrez.email = 'another_email@gmail.com' # FIXME skip = set() if args.skip: @@ -36,10 +27,11 @@ if args.skip: for line in content: skip.add(line.strip()) -print(f"Skip size is {len(skip)}",file=sys.stderr) +print(f"Skip size is {len(skip)}", file=sys.stderr) # Try to search several strings TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]'] + # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP'] @@ -47,22 +39,27 @@ ids = set() for term in TERMS: num_read = BATCH_SIZE retstart = 0 + while num_read == BATCH_SIZE: record = Entrez.read( - Entrez.esearch(db='nuccore', term=term, idtype='acc', - retstart=retstart, retmax=BATCH_SIZE) + Entrez.esearch(db='nuccore', term=term, idtype='acc', retstart=retstart, retmax=BATCH_SIZE) ) + idlist = record['IdList'] new_ids = set(idlist) num_read = len(new_ids) - print(num_read,":",idlist[0],file=sys.stderr) retstart += num_read - new_ids.difference_update(skip) # remove skip ids + + print(num_read, ":", idlist[0], file=sys.stderr) + + new_ids.difference_update(skip) # remove skip ids new_ids = set([id for id in new_ids if id[:2] not in PREFIX]) - ids.update(new_ids) # add to total set - print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)})",file=sys.stderr) + ids.update(new_ids) # add to total set + + print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)}", file=sys.stderr) + if args.max and len(ids) > args.max: - print(f"Stopping past #{args.max} items",file=sys.stderr) + print(f"Stopping past #{args.max} items", file=sys.stderr) break for id in ids: diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 85d615c..026c03f 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -111,7 +111,8 @@ def get_metadata(id, gbseq): # print(n,file=sys.stderr) if n != 'Unpublished': institute,address = n.split(',',1) - submitter.submitter_name = institute.split(') ')[1] + if ")" in institute: + submitter.submitter_name = institute.split(')')[1] submitter.submitter_address = address.strip() except AttributeError: pass diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py index 9414864..1a8035d 100755 --- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py +++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py @@ -33,43 +33,47 @@ states = {} for xmlfn in args.files: print(f"--- Reading {xmlfn}") - with gzip.open(xmlfn, 'r') as f: - xml = f.read().decode() - tree = ET.fromstring(xml) - for gb in tree.findall('./GBSeq'): - valid = None - error = None - meta = {} - id = gb.find("GBSeq_locus").text - basename = dir+"/"+id - print(f" parsing {id}") - try: - valid,meta = genbank.get_metadata(id,gb) - if valid: - # --- write JSON - jsonfn = basename + ".json" - with open(jsonfn, 'w') as outfile: - print(f" writing {jsonfn}") - json.dump(meta, outfile, indent=4) - # --- write FASTA - fa = basename+".fa" - seq = genbank.get_sequence(id,gb) - print(f" writing {fa}") - with open(fa,"w") as f2: - f2.write(f"> {id}\n") - f2.write(seq) - # print(seq) - except genbank.GBError as e: - error = f"{e} for {id}" - print(error,file=sys.stderr) - valid = False - state = {} - state['valid'] = valid - if error: - state['error'] = error - if meta['warnings']: - state['warnings'] = meta['warnings'] - states[id] = state + try: + with gzip.open(xmlfn, 'r') as f: + xml = f.read().decode() + except Exception: + with open(xmlfn, 'r') as f: + xml = f.read() + tree = ET.fromstring(xml) + for gb in tree.findall('./GBSeq'): + valid = None + error = None + meta = {} + id = gb.find("GBSeq_locus").text + basename = dir+"/"+id + print(f" parsing {id}") + try: + valid,meta = genbank.get_metadata(id,gb) + if valid: + # --- write JSON + jsonfn = basename + ".json" + with open(jsonfn, 'w') as outfile: + print(f" writing {jsonfn}") + json.dump(meta, outfile, indent=4) + # --- write FASTA + fa = basename+".fa" + seq = genbank.get_sequence(id,gb) + print(f" writing {fa}") + with open(fa,"w") as f2: + f2.write(f"> {id}\n") + f2.write(seq) + # print(seq) + except genbank.GBError as e: + error = f"{e} for {id}" + print(error,file=sys.stderr) + valid = False + state = {} + state['valid'] = valid + if error: + state['error'] = error + if meta['warnings']: + state['warnings'] = meta['warnings'] + states[id] = state statefn = dir + '/state.json' with open(statefn, 'w') as outfile: diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py index dca5563..95f5a93 100755 --- a/workflows/pull-data/genbank/update-from-genbank.py +++ b/workflows/pull-data/genbank/update-from-genbank.py @@ -14,22 +14,21 @@ import sys from utils import chunks from Bio import Entrez -Entrez.email = 'another_email@gmail.com' # FIXME -BATCH=100 +Entrez.email = 'another_email@gmail.com' # FIXME + +BATCH = 100 parser = argparse.ArgumentParser() -parser.add_argument('--max', type=int, help='Max queries', required=False) parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True) parser.add_argument('--out', type=str, help='Directory to write to', required=True) +parser.add_argument('--max', type=int, help='Max queries', required=False) args = parser.parse_args() ids = set() with open(args.ids) as f: - content = f.readlines() - for line in content: - ids.add(line.strip()) + ids.update([line.strip() for line in f]) dir = args.out if not os.path.exists(dir): @@ -37,12 +36,14 @@ if not os.path.exists(dir): request_num = BATCH if args.max: - request_num = min(BATCH,args.max) + request_num = min(BATCH, args.max) + +for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)): + xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz") + print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr) -for i, idsx in enumerate(chunks(list(ids), request_num)): - xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz") - print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr) with gzip.open(xmlfn, 'w') as f: - f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode()) - if args.max and i*request_num >= args.max: + f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode()) + + if args.max and num_chunk * request_num >= args.max: break diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py index 3efc67a..96920a5 100644 --- a/workflows/pull-data/genbank/utils.py +++ b/workflows/pull-data/genbank/utils.py @@ -1,5 +1,6 @@ import os + def is_integer(string_to_check): try: int(string_to_check) @@ -7,19 +8,26 @@ def is_integer(string_to_check): except ValueError: return False + def chunks(lst, n): for i in range(0, len(lst), n): yield lst[i:i + n] + def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): - # Check duplicated entry looking at all dictionaries + """ + Check duplicated entry by looking in all dictionaries + """ + field_to_term_to_uri_dict = {} - path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in - os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')] + path_dict_xxx_csv_list = [ + os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in + os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv') + ] for path_dict_xxx_csv in path_dict_xxx_csv_list: - print('Read {}'.format(path_dict_xxx_csv)) + print(f'Read {path_dict_xxx_csv}') with open(path_dict_xxx_csv) as f: for line in f: @@ -31,7 +39,7 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): term = term.strip('"') if term in field_to_term_to_uri_dict: - print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) + print(f'Warning: in the dictionaries there are more entries for the same term ({term}).') continue field_to_term_to_uri_dict[term] = uri @@ -54,9 +62,9 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): term = term.strip('"') if term in field_to_term_to_uri_dict[field]: - print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term)) + print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).') continue field_to_term_to_uri_dict[field][term] = uri - return field_to_term_to_uri_dict
\ No newline at end of file + return field_to_term_to_uri_dict |