Moving tools out of submodules (sorry!)

author: Pjotr Prins 2021-01-28 18:45:52 +0000
committer: Pjotr Prins 2021-01-28 18:45:52 +0000
commit: 8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3 (patch)
tree: 3d17dd32522df3cfa808e8df6ebf722a70cc01d3 /workflows/pubseq/normalize
parent: 90470bc795a17a6ddf6dca156f507d02cb056ec3 (diff)
download: bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.tar.gz
bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.tar.lz
bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.zip
3 files changed, 116 insertions, 0 deletions
diff --git a/workflows/pubseq/normalize/README.md b/workflows/pubseq/normalize/README.md
new file mode 100644
index 0000000..b780a68
--- /dev/null
+++ b/workflows/pubseq/normalize/README.md
@@ -0,0 +1,14 @@
+# Normalization steps
+
+This library contains generic logic to normalize (string) data and
+transform strings to URIs.  It should be applicable to data from
+any source (GenBank, ENA, etc.).
+
+Important: missing data should be missing or None! Do not fill
+in data by 'guessing'.
+
+When data is malformed a warning should be logged and added to the
+warning list. Functions should be small enough to return only 1
+warning!
+
+Pjotr Prins (c) 2021
diff --git a/workflows/pubseq/normalize/__init__.py b/workflows/pubseq/normalize/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/workflows/pubseq/normalize/__init__.py
diff --git a/workflows/pubseq/normalize/mapping.py b/workflows/pubseq/normalize/mapping.py
new file mode 100644
index 0000000..3ed09c2
--- /dev/null
+++ b/workflows/pubseq/normalize/mapping.py
@@ -0,0 +1,102 @@
+# Normalization steps
+#
+# This library contains generic logic to normalize (string) data and
+# transform strings to URIs.  It should be applicable to data from
+# any source (GenBank, ENA, etc.).
+#
+#   Important: missing data should be missing or None! Do not fill
+#   in data by 'guessing'.
+#
+#   When data is malformed a warning should be logged and added to the
+#   warning list. Functions should be small enough to return only 1
+#   warning!
+#
+#   Pjotr Prins (c) 2021
+
+import re
+import types
+
+def host_species(host,mapping):
+    """Normalize ``host['host_species']`` to an ontology URI.
+
+    Resolution order for the current value:
+
+    1. a value that already contains 'obolibrary' is kept as is;
+    2. an exact hit in ``mapping`` is used;
+    3. otherwise the SPECIES_TERMS regular expressions are searched
+       case-insensitively, in order, and the first hit is used.
+
+    Parameters:
+      host:    dict describing the host; not modified, a copy is returned.
+      mapping: dict from literal host_species strings to URIs.
+
+    Returns a tuple ``(host_dict, warning)``.  ``warning`` is None or a
+    single message when no URI could be found.  When the value cannot
+    be mapped (or maps to ``Unknown``) the 'host_species' key is removed
+    from the returned dict - missing data stays missing.  A dict without
+    'host_species' (or with None) is returned unchanged and without a
+    warning, like specimen_source() does for a missing field.
+    """
+    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+
+    # Since Python 3.7 dict is ordered: list the most specific terms
+    # first, the first match wins.  Note that re is allowed in the keys.
+    SPECIES_TERMS = {
+        "human": Homo_sapiens,
+        "sapiens": Homo_sapiens,
+        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
+        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
+        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
+        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
+        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
+        # Mink:
+        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
+        }
+
+    warning = None
+    host = dict(host) # work on a copy so the caller's dict is untouched
+    key = host.get('host_species')
+    if key is None:
+        # Nothing to normalize; do not guess and do not warn
+        return host,warning
+    if 'obolibrary' in key:
+        # Already a URI
+        return host,warning
+
+    if key in mapping:
+        uri = mapping[key]
+    else:
+        uri = None
+        for term,term_uri in SPECIES_TERMS.items():
+            if re.search(term,key,re.IGNORECASE):
+                uri = term_uri
+                break # first (most specific) term wins
+    if not uri:
+        warning = f"No URI mapping for host_species <{key}>"
+    if uri == Unknown or uri is None:
+        # Unknown is a deliberate 'no data' marker: drop without warning
+        del host['host_species']
+    else:
+        host['host_species'] = uri
+    return host,warning
+
+# Sentinel used as the mapped value for terms that explicitly say the
+# data is unknown (see the "uknown"/"unknown" entries in SPECIMEN_TERMS).
+# It is a non-empty string, so a value mapped to it does not count as
+# 'unmapped'; the functions in this module compare against it after
+# mapping to decide whether the field should be dropped.
+Unknown = "Not found" # So as not to create a warning
+
+def specimen_source(sample,mapping):
+    """Normalize ``sample['specimen_source']`` to a list of ontology URIs.
+
+    Every entry of the specimen_source list is resolved independently:
+
+    1. an entry that already contains 'obolibrary' is kept as is;
+    2. an exact hit in ``mapping`` is used;
+    3. otherwise the SPECIMEN_TERMS regular expressions are searched
+       case-insensitively, in order, and the first hit is used.
+
+    Parameters:
+      sample:  dict describing the sample; not modified, a copy is
+               returned.  'specimen_source' is normally a list of
+               strings; a single string is treated as a one item list.
+      mapping: dict from literal specimen_source strings to URIs.
+
+    Returns a tuple ``(sample_dict, warning)``.  ``warning`` is None or
+    a single message naming the last entry without a URI mapping.
+    Entries that resolve to ``Unknown`` are dropped without a warning.
+    When no URI is left (or the field is None) the 'specimen_source'
+    key is removed from the returned dict - missing data stays missing.
+    A dict without 'specimen_source' is returned unchanged.
+    """
+    Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
+    Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
+    Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
+    Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
+    Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
+    Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
+    Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+    # Same term as the fluid; must be a plain string, not a tuple
+    Bronchoalveolar_Lavage = Bronchoalveolar_Lavage_Fluid
+    Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
+    # Since Python 3.7 dict is ordered: list the most specific terms
+    # first, the first match wins.  Note that re is allowed in the keys.
+    SPECIMEN_TERMS = {
+        "Oronasopharynx": Oronasopharynx,
+        "orophar": Oropharyngeal,
+        "pharyngeal": Nasopharyngeal,
+        r"\snares": Nasal_Swab,
+        "saliva": Saliva,
+        "swab": Nasal_Swab,
+        "broncho": Bronchoalveolar_Lavage,
+        "seafood": Frozen_Food,
+        "packaging": Frozen_Food,
+        "specimen": Biospecimen,
+        "patient": Biospecimen,
+        "uknown": Unknown, # common misspelling in submitted data
+        "unknown": Unknown
+        }
+    warning = None
+    sample = dict(sample) # work on a copy so the caller's dict is untouched
+    if 'specimen_source' not in sample:
+        return sample,warning
+    keys = sample['specimen_source']
+    if keys is None:
+        del sample['specimen_source']
+        return sample,warning
+    if not keys:
+        # Empty value: nothing to map, leave it alone
+        return sample,warning
+    if isinstance(keys,str):
+        # Do not iterate over the characters of a bare string
+        keys = [keys]
+
+    uris = []
+    for key in keys:
+        if 'obolibrary' in key:
+            # Already a URI
+            uris.append(key)
+            continue
+        if key in mapping:
+            uri = mapping[key]
+        else:
+            uri = None
+            for term,term_uri in SPECIMEN_TERMS.items():
+                if re.search(term,key,re.IGNORECASE):
+                    uri = term_uri
+                    break # first (most specific) term wins
+        if not uri:
+            warning = f"No URI mapping for specimen_source <{key}>"
+        elif uri != Unknown:
+            # Unknown is a deliberate 'no data' marker: skip without warning
+            uris.append(uri)
+
+    if uris:
+        sample['specimen_source'] = uris
+    else:
+        del sample['specimen_source']
+    return sample,warning
author	Pjotr Prins	2021-01-28 18:45:52 +0000
committer	Pjotr Prins	2021-01-28 18:45:52 +0000
commit	8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3 (patch)
tree	3d17dd32522df3cfa808e8df6ebf722a70cc01d3 /workflows/pubseq/normalize
parent	90470bc795a17a6ddf6dca156f507d02cb056ec3 (diff)
download	bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.tar.gz bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.tar.lz bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.zip