1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
#!/usr/bin/env python3
#
# bulk download GenBank data and metadata, preparing the FASTA and the
# YAML files
#
# update-from-genbank.py --max 10 --ids ids.txt --out ~/tmp/genbank-xml
#
# See also directory .guix-run and README.md
import argparse
import gzip
import os
import sys
from utils import chunks
from Bio import Entrez
Entrez.email = 'another_email@gmail.com' # FIXME: set a real contact address; NCBI requires one for E-utilities
# Number of ids requested per efetch call.
BATCH = 100
parser = argparse.ArgumentParser()
parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
parser.add_argument('--out', type=str, help='Directory to write to', required=True)
parser.add_argument('--max', type=int, help='Max queries', required=False)
args = parser.parse_args()
# Collect unique, non-empty ids; skipping blank lines keeps empty
# strings out of the efetch request.
ids = set()
with open(args.ids) as f:
    ids.update(line.strip() for line in f if line.strip())
out_dir = args.out  # renamed from `dir` to avoid shadowing the builtin
if not os.path.exists(out_dir):
    raise Exception(f"Directory {out_dir} does not exist")
# Never request more than BATCH ids per call, and no more than --max total.
request_num = BATCH
if args.max:
    request_num = min(BATCH, args.max)
for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
    xmlfn = os.path.join(out_dir, f"metadata_{num_chunk}.xml.gz")
    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
    handle = Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml')
    try:
        with gzip.open(xmlfn, 'w') as f:
            f.write(handle.read().encode())
    finally:
        handle.close()  # release the network handle even if the write fails
    # Stop once the ids fetched *including this chunk* reach --max.
    # The previous test (num_chunk * request_num >= args.max) only fired on
    # the following iteration, so one extra chunk was always downloaded.
    if args.max and (num_chunk + 1) * request_num >= args.max:
        break
|