path: root/workflows/tools/pubseq-fetch-data.py
#!/usr/bin/env python3
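# Usage sketch (illustrative; "data", ID1 and ID2 below are placeholder
# values, and the --out directory must already exist):
#
#   ./pubseq-fetch-data.py --out data ID1 ID2
#   ./pubseq-fetch-data.py --out data --fasta --ids ids.txt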

import argparse
import json
import os
import requests
import sys

parser = argparse.ArgumentParser(description="""

Fetch metadata (JSON) from PubSeq and, optionally, the FASTA files. IDs
can be passed on the command line or in a file.

""")
parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
parser.add_argument('--out', type=str, required=True, help='Directory to write to')
parser.add_argument('--ids', type=str, required=False, help='File with IDs, one per line')
parser.add_argument('id', nargs='*', help='ID(s) to fetch')
args = parser.parse_args()

outdir = args.out
if not os.path.exists(outdir):
    raise Exception(f"Directory {outdir} does not exist")

ids = args.id
if not ids:
    # No IDs on the command line: read them from the --ids file instead
    if not args.ids:
        sys.exit("Provide IDs on the command line or via --ids FILE")
    print(f"Reading {args.ids}")
    with open(args.ids) as f:
        ids = [line.strip() for line in f if line.strip()]

for id in ids:
    print(id)
    jsonfn = outdir + "/" + id + ".json"
    # Skip samples whose metadata has already been fetched
    if not os.path.exists(jsonfn):
        r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
        if r:
            # The API record points at the metadata (and FASTA) download URLs
            m_url = r.json()[0]['metadata']
            mr = requests.get(m_url)
            mr.raise_for_status()
            with open(jsonfn, "w") as outf:
                outf.write(mr.text)
            if args.fasta:
                fastafn = outdir + "/" + id + ".fa"
                if os.path.exists(fastafn):
                    continue
                fa_url = r.json()[0]['fasta']
                fr = requests.get(fa_url)
                fr.raise_for_status()
                with open(fastafn, "w") as outf:
                    outf.write(fr.text)
        else:
            raise Exception(f"Cannot find record for {id}")