From b112b3203e34ea61dfdf802bce5036f938eaa774 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Fri, 8 Jan 2021 10:33:13 +0000
Subject: GenBank: Fix normalization and depth differences with original
 records

---
 workflows/pull-data/genbank/.guix-run  |  2 +-
 workflows/pull-data/genbank/genbank.py | 17 ++++------
 workflows/tools/normalize-yamlfa.py    |  6 ++--
 workflows/tools/normalize/mapping.py   | 59 +++++++++++++++++++++++++---------
 4 files changed, 54 insertions(+), 30 deletions(-)

(limited to 'workflows')

diff --git a/workflows/pull-data/genbank/.guix-run b/workflows/pull-data/genbank/.guix-run
index 6db7871..f6b1a0c 100644
--- a/workflows/pull-data/genbank/.guix-run
+++ b/workflows/pull-data/genbank/.guix-run
@@ -4,5 +4,5 @@
 echo # next run:
 echo 'export PATH=$GUIX_ENVIRONMENT/bin:$PATH'
 
-~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby
+~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby jq
 
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 026c03f..a994055 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -90,10 +90,9 @@ def get_metadata(id, gbseq):
         except AttributeError:
             warn("Missing "+msg)
 
-    host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
     sample.sample_id = id
     sample.database = "https://www.ncbi.nlm.nih.gov/genbank/"
-    sample.source_database_accession = f"http://identifiers.org/insdc/{id}#sequence"
+    sample.source_database_accession = [ f"http://identifiers.org/insdc/{id}#sequence" ]
     #   <GBQualifier_value>USA: Cruise_Ship_1, California</GBQualifier_value>
     n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='country']/../GBQualifier_value")
     if n: sample.collection_location = n
@@ -112,7 +111,7 @@ def get_metadata(id, gbseq):
         if n != 'Unpublished':
             institute,address = n.split(',',1)
             if ")" in institute:
-                submitter.submitter_name = institute.split(')')[1]
+                submitter.submitter_name = [institute.split(')')[1].strip()]
             submitter.submitter_address = address.strip()
     except AttributeError:
         pass
@@ -129,13 +128,13 @@ def get_metadata(id, gbseq):
         # technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
         p = re.compile(r'.*Assembly Method :: ([^;]+).*')
         m = p.match(n)
-        if m: technology.alignment_protocol = m.group(1)
+        if m: technology.alignment_protocol = m.group(1).strip()
         p = re.compile(r'.*Coverage :: ([^;]+).*')
         m = p.match(n)
         if m: technology.sequencing_coverage = m.group(1)
         p = re.compile(r'.*Sequencing Technology :: ([^;]+).*')
         m = p.match(n)
-        if m: technology.sample_sequencing_technology = m.group(1).strip()
+        if m: technology.sample_sequencing_technology = [m.group(1).strip()]
         else: warn("Missing sample_sequencing_technology")
 
     # --- Dates
@@ -167,10 +166,7 @@ def get_metadata(id, gbseq):
     n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value")
     if n:
         list = n.split('; ')
-        species = list[0]
-        host.host_species = species
-        if species != "Homo sapiens":
-            warn(f"Species not understood: {species}")
+        host.host_species = list[0]
         if len(list)>1:
             sex = list[1]
             if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male'
@@ -183,13 +179,12 @@ def get_metadata(id, gbseq):
             if m:
                 host.host_age = int(m.group(1))
                 host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036'
-        # sys.exit(1)
     n = fetch("virus_strain", ".//GBQualifier/GBQualifier_name/[.='isolate']/../GBQualifier_value")
     if n: virus.virus_strain = n
     n = fetch("virus_species", ".//GBQualifier/GBQualifier_name/[.='db_xref']/../GBQualifier_value")
     if n: virus.virus_species = "http://purl.obolibrary.org/obo/NCBITaxon_"+n.split('taxon:')[1]
     n = fetch("specimen_source", ".//GBQualifier/GBQualifier_name/[.='isolation_source']/../GBQualifier_value")
-    if n: sample.specimen_source = n
+    if n: sample.specimen_source = [n]
 
     info = {
         'id': 'placeholder',
diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
index 20c2feb..55a8848 100755
--- a/workflows/tools/normalize-yamlfa.py
+++ b/workflows/tools/normalize-yamlfa.py
@@ -19,10 +19,10 @@ directory are parsed using the state.json file. It is possible
 to select a subset of IDs.
 
 This tool has two modes of operation. It can validate with the
-`--validate` switch which stops at a warning and does no rewriting.
+--validate switch which stops at a warning and does no rewriting.
 This mode is typically used in troubleshooting.
 
-The other mode is `--rewrite` which rewrites the JSON files after
+The other mode is --rewrite which rewrites the JSON files after
 making a backup (.bak) of the original. This mode updates files and
 won't stop - it is used for (automated) uploads.
 
@@ -92,6 +92,6 @@ for id in ids:
                 os.rename(fn,fn+".bak")
             with open(fn, 'w') as outfile:
                 print(f"    Writing {fn}")
-                json.dump(rec.__dict__, outfile, indent=4)
+                json.dump(rec.__dict__, outfile, indent=2)
         else:
             print(rec)
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
index bc82fea..3ed09c2 100644
--- a/workflows/tools/normalize/mapping.py
+++ b/workflows/tools/normalize/mapping.py
@@ -17,14 +17,36 @@ import re
 import types
 
 def host_species(host,mapping):
+    # Normalize host['host_species'] to an ontology URI; returns (host dict, warning or None)
+    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+    SPECIES_TERMS = { # keys are regexes, tried case-insensitively in dict order; the last match wins
+        "human": Homo_sapiens,
+        "sapiens": Homo_sapiens,
+        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
+        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
+        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
+        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
+        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
+        # Mink:
+        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
+        }
+
     warning = None
     host = types.SimpleNamespace(**host)
+    if getattr(host, 'host_species', None) is None: return host.__dict__,warning # no species recorded, nothing to map
     if not 'obolibrary' in host.host_species:
         key = host.host_species
+        host.host_species = None
         if key in mapping:
             host.host_species = mapping[key]
         else:
+            for term in SPECIES_TERMS:
+                if re.search(term, key, re.IGNORECASE):
+                    host.host_species = SPECIES_TERMS[term]
+        if not host.host_species:
             warning = f"No URI mapping for host_species <{key}>"
+        if host.host_species == Unknown or host.host_species is None:
+            del(host.host_species)
     return host.__dict__,warning
 
 Unknown = "Not found" # So as not to create a warning
@@ -35,8 +57,10 @@ def specimen_source(sample,mapping):
     Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
     Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
     Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
-    Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119"
+    Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
     Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+    Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195"
+    Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
     SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
         "Oronasopharynx": Oronasopharynx,
         "orophar": Oropharyngeal,
@@ -44,28 +68,33 @@ def specimen_source(sample,mapping):
         "\snares": Nasal_Swab,
         "saliva": Saliva,
         "swab": Nasal_Swab,
+        "broncho": Bronchoalveolar_Lavage,
         "seafood": Frozen_Food,
         "packaging": Frozen_Food,
+        "specimen": Biospecimen,
+        "patient": Biospecimen,
         "uknown": Unknown,
         "unknown": Unknown
         }
     warning = None
     sample = types.SimpleNamespace(**sample)
     try:
-        if sample.specimen_source and \
-           not 'obolibrary' in sample.specimen_source and \
-           not 'wikidata' in sample.specimen_source:
-            key = sample.specimen_source
-            sample.specimen_source = None
-            if key in mapping:
-                sample.specimen_source = mapping[key]
-            else:
-                for term in SPECIMEN_TERMS:
-                    p = re.compile(".*?"+term,re.IGNORECASE)
-                    m = p.match(key)
-                    if m: sample.specimen_source = SPECIMEN_TERMS[term]
-        if not sample.specimen_source:
-            warning = f"No URI mapping for specimen_source <{key}>"
+        if sample.specimen_source:
+            keys = sample.specimen_source
+            sample.specimen_source = []
+            for key in keys:
+                uri = mapping[key] if key in mapping else None
+                if 'obolibrary' in key or 'wikidata' in key:
+                    uri = key # already a URI, keep as is
+                elif uri is None:
+                    for term in SPECIMEN_TERMS: # no break: the last matching term wins
+                        if re.search(term, key, re.IGNORECASE):
+                            uri = SPECIMEN_TERMS[term]
+                if not uri:
+                    warning = f"No URI mapping for specimen_source <{key}>"
+                elif uri != Unknown: # Unknown suppresses the warning but is not stored
+                    sample.specimen_source.append(uri)
+            if not sample.specimen_source: sample.specimen_source = None # nothing mapped: removed by the check below
         if sample.specimen_source == Unknown or sample.specimen_source == None:
             del(sample.specimen_source)
     except AttributeError:
-- 
cgit 1.4.1