-rwxr-xr-x  workflows/pull-data/genbank/genbank-fetch-ids.py             1
-rw-r--r--  workflows/pull-data/genbank/genbank.py                        3
-rwxr-xr-x  workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py  78
-rwxr-xr-x  workflows/pull-data/genbank/update-from-genbank.py           25
-rw-r--r--  workflows/pull-data/genbank/utils.py                         22
-rwxr-xr-x  workflows/tools/pubseq-fetch-data.py                         33
6 files changed, 90 insertions, 72 deletions
diff --git a/workflows/pull-data/genbank/genbank-fetch-ids.py b/workflows/pull-data/genbank/genbank-fetch-ids.py
index cb48cd8..e9e7315 100755
--- a/workflows/pull-data/genbank/genbank-fetch-ids.py
+++ b/workflows/pull-data/genbank/genbank-fetch-ids.py
@@ -8,7 +8,6 @@
 
 import argparse
 import sys
-from datetime import date
 
 from Bio import Entrez
 
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 85d615c..026c03f 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -111,7 +111,8 @@ def get_metadata(id, gbseq):
         # print(n,file=sys.stderr)
         if n != 'Unpublished':
             institute,address = n.split(',',1)
-            submitter.submitter_name = institute.split(') ')[1]
+            if ")" in institute:
+                submitter.submitter_name = institute.split(')')[1]
             submitter.submitter_address = address.strip()
     except AttributeError:
         pass
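
A note on the hunk above: the GBSeq reference line it parses typically looks like "Submitted (07-JUL-2020) Some Institute, Some Address", and the new guard skips the name when no ")" is present. A minimal standalone sketch of that behaviour (the sample strings are hypothetical, not real GenBank records):

    # Sketch of the guarded submitter parse above; sample strings are
    # hypothetical.
    def parse_submitter(n):
        institute, address = n.split(',', 1)
        name = None
        if ')' in institute:
            name = institute.split(')')[1]
        return name, address.strip()

    print(parse_submitter("Submitted (07-JUL-2020) Example Institute, 1 Example Rd"))
    # -> (' Example Institute', '1 Example Rd')
    print(parse_submitter("Example Institute, 1 Example Rd"))
    # -> (None, '1 Example Rd')

Note that splitting on ')' rather than the old ') ' keeps a leading space in the name; a .strip() on the result would remove it.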
diff --git a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
index 9414864..1a8035d 100755
--- a/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
+++ b/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py
@@ -33,43 +33,47 @@ states = {}
 
 for xmlfn in args.files:
     print(f"--- Reading {xmlfn}")
-    with gzip.open(xmlfn, 'r') as f:
-        xml = f.read().decode()
-        tree = ET.fromstring(xml)
-        for gb in tree.findall('./GBSeq'):
-            valid = None
-            error = None
-            meta = {}
-            id = gb.find("GBSeq_locus").text
-            basename = dir+"/"+id
-            print(f"    parsing {id}")
-            try:
-                valid,meta = genbank.get_metadata(id,gb)
-                if valid:
-                    # --- write JSON
-                    jsonfn = basename + ".json"
-                    with open(jsonfn, 'w') as outfile:
-                        print(f"    writing {jsonfn}")
-                        json.dump(meta, outfile, indent=4)
-                    # --- write FASTA
-                    fa = basename+".fa"
-                    seq = genbank.get_sequence(id,gb)
-                    print(f"    writing {fa}")
-                    with open(fa,"w") as f2:
-                        f2.write(f"> {id}\n")
-                        f2.write(seq)
-                    # print(seq)
-            except genbank.GBError as e:
-                error = f"{e} for {id}"
-                print(error,file=sys.stderr)
-                valid = False
-            state = {}
-            state['valid'] = valid
-            if error:
-                state['error'] = error
-            if meta['warnings']:
-                state['warnings'] = meta['warnings']
-            states[id] = state
+    try:
+        with gzip.open(xmlfn, 'r') as f:
+            xml = f.read().decode()
+    except Exception:
+        with open(xmlfn, 'r') as f:
+            xml = f.read()
+    tree = ET.fromstring(xml)
+    for gb in tree.findall('./GBSeq'):
+        valid = None
+        error = None
+        meta = {}
+        id = gb.find("GBSeq_locus").text
+        basename = dir+"/"+id
+        print(f"    parsing {id}")
+        try:
+            valid,meta = genbank.get_metadata(id,gb)
+            if valid:
+                # --- write JSON
+                jsonfn = basename + ".json"
+                with open(jsonfn, 'w') as outfile:
+                    print(f"    writing {jsonfn}")
+                    json.dump(meta, outfile, indent=4)
+                # --- write FASTA
+                fa = basename+".fa"
+                seq = genbank.get_sequence(id,gb)
+                print(f"    writing {fa}")
+                with open(fa,"w") as f2:
+                    f2.write(f"> {id}\n")
+                    f2.write(seq)
+                # print(seq)
+        except genbank.GBError as e:
+            error = f"{e} for {id}"
+            print(error,file=sys.stderr)
+            valid = False
+        state = {}
+        state['valid'] = valid
+        if error:
+            state['error'] = error
+        if meta['warnings']:
+            state['warnings'] = meta['warnings']
+        states[id] = state
 
 statefn = dir + '/state.json'
 with open(statefn, 'w') as outfile:
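
The try/except above lets the transformer accept both gzipped and plain XML input. The same fallback as a standalone sketch, catching the narrower OSError/EOFError that gzip actually raises for non-gzip input instead of a bare Exception:

    import gzip

    # Sketch of the gzip-or-plain fallback read above: gzip.open()
    # raises OSError ("Not a gzipped file") on plain text, in which
    # case the file is reopened as plain XML.
    def read_xml(xmlfn):
        try:
            with gzip.open(xmlfn, 'r') as f:
                return f.read().decode()
        except (OSError, EOFError):
            with open(xmlfn, 'r') as f:
                return f.read()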
diff --git a/workflows/pull-data/genbank/update-from-genbank.py b/workflows/pull-data/genbank/update-from-genbank.py
index dca5563..95f5a93 100755
--- a/workflows/pull-data/genbank/update-from-genbank.py
+++ b/workflows/pull-data/genbank/update-from-genbank.py
@@ -14,22 +14,21 @@ import sys
 from utils import chunks
 
 from Bio import Entrez
-Entrez.email = 'another_email@gmail.com' # FIXME
 
-BATCH=100
+Entrez.email = 'another_email@gmail.com'  # FIXME
+
+BATCH = 100
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--max', type=int, help='Max queries', required=False)
 parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
 parser.add_argument('--out', type=str, help='Directory to write to', required=True)
+parser.add_argument('--max', type=int, help='Max queries', required=False)
 
 args = parser.parse_args()
 
 ids = set()
 with open(args.ids) as f:
-    content = f.readlines()
-    for line in content:
-        ids.add(line.strip())
+    ids.update([line.strip() for line in f])
 
 dir = args.out
 if not os.path.exists(dir):
@@ -37,12 +36,14 @@ if not os.path.exists(dir):
   os.makedirs(dir)
 
 request_num = BATCH
 if args.max:
-  request_num = min(BATCH,args.max)
+    request_num = min(BATCH, args.max)
+
+for num_chunk, ids_chunk in enumerate(chunks(list(ids), request_num)):
+    xmlfn = os.path.join(dir, f"metadata_{num_chunk}.xml.gz")
+    print(f"Fetching {xmlfn} ({num_chunk * request_num})", file=sys.stderr)
 
-for i, idsx in enumerate(chunks(list(ids), request_num)):
-    xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
-    print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
     with gzip.open(xmlfn, 'w') as f:
-        f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
-    if args.max and i*request_num >= args.max:
+        f.write(Entrez.efetch(db='nuccore', id=ids_chunk, retmode='xml').read().encode())
+
+    if args.max and num_chunk * request_num >= args.max:
         break
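
The renamed num_chunk/ids_chunk variables make the batching easier to follow; here is the chunking and --max cut-off as a pure-Python sketch (no Entrez calls; the ids and limits are made up):

    # Pure-Python sketch of the batch loop above; nothing is fetched.
    def chunks(lst, n):
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    ids = [f"ID{i:05d}" for i in range(350)]     # hypothetical ids
    request_num, max_ids = 100, 250
    for num_chunk, ids_chunk in enumerate(chunks(ids, request_num)):
        print(f"chunk {num_chunk}: {len(ids_chunk)} ids")
        if max_ids and num_chunk * request_num >= max_ids:
            break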
diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
index 3efc67a..96920a5 100644
--- a/workflows/pull-data/genbank/utils.py
+++ b/workflows/pull-data/genbank/utils.py
@@ -1,5 +1,6 @@
 import os
 
+
 def is_integer(string_to_check):
     try:
         int(string_to_check)
@@ -7,19 +8,26 @@ def is_integer(string_to_check):
     except ValueError:
         return False
 
+
 def chunks(lst, n):
     for i in range(0, len(lst), n):
         yield lst[i:i + n]
 
+
 def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
-    # Check duplicated entry looking at all dictionaries
+    """
+    Check duplicated entry by looking in all dictionaries
+    """
+
     field_to_term_to_uri_dict = {}
 
-    path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
-                              os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+    path_dict_xxx_csv_list = [
+        os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+        os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
+    ]
 
     for path_dict_xxx_csv in path_dict_xxx_csv_list:
-        print('Read {}'.format(path_dict_xxx_csv))
+        print(f'Read {path_dict_xxx_csv}')
 
         with open(path_dict_xxx_csv) as f:
             for line in f:
@@ -31,7 +39,7 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict:
-                    print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                    print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[term] = uri
@@ -54,9 +62,9 @@ def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
                 term = term.strip('"')
 
                 if term in field_to_term_to_uri_dict[field]:
-                    print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                    print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
                     continue
 
                 field_to_term_to_uri_dict[field][term] = uri
 
-    return field_to_term_to_uri_dict
\ No newline at end of file
+    return field_to_term_to_uri_dict
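
A quick usage sketch for the small helpers above (assumes utils.py is on the import path; inputs are arbitrary):

    # Usage sketch; inputs are arbitrary.
    from utils import chunks, is_integer

    print(is_integer("42"))                    # True
    print(is_integer("4.2"))                   # False
    print(list(chunks([1, 2, 3, 4, 5], 2)))    # [[1, 2], [3, 4], [5]]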
diff --git a/workflows/tools/pubseq-fetch-data.py b/workflows/tools/pubseq-fetch-data.py
index 2119fdf..ef4edde 100755
--- a/workflows/tools/pubseq-fetch-data.py
+++ b/workflows/tools/pubseq-fetch-data.py
@@ -5,6 +5,7 @@ import json
 import os
 import requests
 import sys
+import time
 
 parser = argparse.ArgumentParser(description="""
 
@@ -33,18 +34,22 @@ for id in ids:
     print(id)
     jsonfn = dir+"/"+id+".json"
     if not os.path.exists(jsonfn):
+        count = 0
         r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
-        if r:
-            m_url = r.json()[0]['metadata']
-            mr = requests.get(m_url)
-            with open(dir+"/"+id+".json","w") as outf:
-                outf.write(mr.text)
-            if args.fasta:
-                fastafn = dir+"/"+id+".fa"
-                if os.path.exists(fastafn): continue
-                fa_url = r.json()[0]['fasta']
-                fr = requests.get(fa_url)
-                with open(fastafn,"w") as outf:
-                    outf.write(fr.text)
-        else:
-            raise Exception(f"Can not find record for {id}")
+        while not r:
+            count += 1
+            if count>10: raise Exception(f"Can not find record for {id}")
+            time.sleep(15)
+            r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
+        m_url = r.json()[0]['metadata']
+        mr = requests.get(m_url)
+        with open(dir+"/"+id+".json","w") as outf:
+            outf.write(mr.text)
+        if args.fasta:
+            fastafn = dir+"/"+id+".fa"
+            if os.path.exists(fastafn): continue
+            fa_url = r.json()[0]['fasta']
+            fr = requests.get(fa_url)
+            with open(fastafn,"w") as outf:
+                outf.write(fr.text)
+
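
The retry loop above works because a requests.Response is falsy for 4xx/5xx status codes, so "while not r" keeps retrying failed requests. The same pattern as a reusable sketch (retry count and delay mirror the hunk; the helper name is ours, not part of the script):

    import time
    import requests

    # Sketch of the retry pattern above; fetch_with_retry is a
    # hypothetical helper name.
    def fetch_with_retry(url, tries=10, delay=15):
        r = requests.get(url)
        count = 0
        while not r:
            count += 1
            if count > tries:
                raise Exception(f"Can not find record at {url}")
            time.sleep(delay)
            r = requests.get(url)
        return r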