about summary refs log tree commit diff
path: root/workflows/pull-data
diff options
context:
space:
mode:
Diffstat (limited to 'workflows/pull-data')
-rw-r--r-- workflows/pull-data/genbank/genbank.py 48
-rw-r--r-- workflows/pull-data/genbank/ref.py 48
2 files changed, 46 insertions, 50 deletions
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 314f50d..90f5a14 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -70,6 +70,13 @@ def get_metadata(id, gbseq):
         print(f"WARNING: {msg}",file=sys.stderr)
         warnings.append(msg)
 
+    def fetch(msg, xpath):  # return the .text of gbseq.find(xpath); on failure warn "Missing <msg>" and return None
+        try:
+            n = gbseq.find(xpath).text  # presumably gbseq is an ElementTree element, so find() gives the first match or None - TODO confirm
+            return n
+        except AttributeError:  # e.g. find() returned None, which has no .text
+            warn("Missing "+msg)  # no return follows, so the caller receives None
+
     host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
     sample.sample_id = id
     sample.database = "https://www.ncbi.nlm.nih.gov/genbank/"
@@ -132,14 +139,47 @@ def get_metadata(id, gbseq):
         warn("Missing collection_date")
         sample.collection_date = None
 
+    # --- Host info
+    # - Homo sapiens
+    # - Homo sapiens; female
+    # - Homo sapiens; female 63
+    # - Homo sapiens; female; age 40
+    # - Homo sapiens; gender: F; age: 61
+    # - Homo sapiens; gender: M; age: 68
+    # - Homo sapiens; hospitalized patient
+    # - Homo sapiens; male
+    # - Homo sapiens; male; 63
+    # - Homo sapiens; male; age 29
+    # - Homo sapiens; symptomatic
+    n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value")  # value of the qualifier named 'host', or None
+    if n:
+        list = n.split('; ')  # NOTE(review): shadows the builtin `list` from here on
+        species = list[0]
+        host.host_species = species  # NOTE(review): replaces the NCBITaxon URI assigned earlier with the raw species text - confirm intended
+        if species != "Homo sapiens":
+            warn(f"Species not understood: {species}")
+        if len(list)>1:
+            sex = list[1]
+            if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male'  # also true for 'female' (substring); corrected by the next line
+            if 'female' in sex or 'gender: F' in sex: host.host_sex = 'female'
+        if len(list)>2:
+            age = list[2]  # NOTE(review): never read; the pattern below is applied to the whole value n
+            p = re.compile(r'[^\d]+(\d+)')  # one or more non-digits from the start, then capture the first run of digits
+            m = p.match(n)
+            print(m.group(1))  # NOTE(review): executes before the `if m` guard; raises AttributeError when nothing matched
+            if m:
+                host.host_age = int(m.group(1))
+                host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036'
+        # sys.exit(1)
+
     info = {
         'id': 'placeholder',
         'update_date': str(update_date),
-        'host': host,
-        'sample': sample,
+        'host': host.__dict__,
+        'sample': sample.__dict__,
         #'virus': virus,
-        'technology': technology,
-        'submitter': submitter,
+        'technology': technology.__dict__,
+        'submitter': submitter.__dict__,
         'warnings': warnings,
         }
     print(info)
diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py
index d809d7f..a9b4f06 100644
--- a/workflows/pull-data/genbank/ref.py
+++ b/workflows/pull-data/genbank/ref.py
@@ -26,53 +26,9 @@
                             technology[field_in_yaml] = tech_info_to_parse
 
 
-            for GBFeature in GBSeq.iter('GBFeature'):
-                if GBFeature.find('GBFeature_key').text != 'source':
-                    continue
-
-                for GBQualifier in GBFeature.iter('GBQualifier'):
-                    GBQualifier_value = GBQualifier.find('GBQualifier_value')
-                    if GBQualifier_value is None:
-                        continue
-                    GBQualifier_value_text = GBQualifier_value.text
-
-                    GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text
-
-                    if GBQualifier_name_text == 'host':
-                        GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female
-                        if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_host_species']:
-                            # Cases like 'Felis catus; Domestic Shorthair'
-                            host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text]
-                        else:
-                            GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
 
-                            if GBQualifier_value_text_list[0] in field_to_term_to_uri_dict['ncbi_host_species']:
-                                host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text_list[0]]
-                            elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version):
-                                # Information checked manually from NCBI Virus
-                                host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species']['Canis lupus familiaris']
-                            else:
-                                missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
-
-                            # Possible cases:
-                            # - Homo sapiens                        --> ['Homo sapiens']
-                            # - Homo sapiens; female                --> ['Homo sapiens', 'female']
-                            # - Homo sapiens; female 63             --> ['Homo sapiens', 'female 63']
-                            # - Homo sapiens; female; age 40        --> ['Homo sapiens', 'female', 'age 40']
-                            # - Homo sapiens; gender: F; age: 61    --> ['Homo sapiens', 'gender: F', 'age: 61']
-                            # - Homo sapiens; gender: M; age: 68    --> ['Homo sapiens', 'gender: M', 'age: 68']
-                            # - Homo sapiens; hospitalized patient  --> ['Homo sapiens', 'hospitalized patient']
-                            # - Homo sapiens; male                  --> ['Homo sapiens', 'male']
-                            # - Homo sapiens; male; 63              --> ['Homo sapiens', 'male', '63']
-                            # - Homo sapiens; male; age 29          --> ['Homo sapiens', 'male', 'age 29']
-                            # - Homo sapiens; symptomatic           --> ['Homo sapiens', 'symptomatic']
-                            if len(GBQualifier_value_text_list) > 1:
-                                host_sex = ''
-                                if 'female' in GBQualifier_value_text_list[1]:
-                                    host_sex = 'female'
-                                elif 'male' in GBQualifier_value_text_list[1]:
-                                    host_sex = 'male'
-                                elif 'gender' in GBQualifier_value_text_list[1]:
+
+       elif 'gender' in GBQualifier_value_text_list[1]:
                                     host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip()
                                     if host_sex_one_lecter in ['F', 'M']:
                                         host_sex = 'female' if host_sex_one_lecter == 'F' else 'male'