aboutsummaryrefslogtreecommitdiff
path: root/workflows/tools/normalize
diff options
context:
space:
mode:
authorPjotr Prins2021-01-04 08:58:38 +0000
committerPjotr Prins2021-01-04 08:58:38 +0000
commit1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (patch)
tree34cc42ef12b81c05be8a57ca2a973b97e52f8461 /workflows/tools/normalize
parentba4161b1660c3a67090dd3715e9862906fb1cc5f (diff)
downloadbh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.gz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.tar.lz
bh20-seq-resource-1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef.zip
Started on normalization
Diffstat (limited to 'workflows/tools/normalize')
-rw-r--r--workflows/tools/normalize/README.md14
-rw-r--r--workflows/tools/normalize/__init__.py0
-rw-r--r--workflows/tools/normalize/mapping.py43
3 files changed, 57 insertions, 0 deletions
diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md
new file mode 100644
index 0000000..b780a68
--- /dev/null
+++ b/workflows/tools/normalize/README.md
@@ -0,0 +1,14 @@
+# Normalization steps
+
+This library contains generic logic to normalize (string) data and
+transform strings to URIs. It should be applicable to data from
+any source (GenBank, ENA, etc.).
+
+Important: missing data should be missing or None! Do not fill
+in data by 'guessing'.
+
+When data is malformed a warning should be logged and added to the
+warning list. Functions should be small enough to return only 1
+warning!
+
+Pjotr Prins (c) 2021
diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/workflows/tools/normalize/__init__.py
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
new file mode 100644
index 0000000..1d52b03
--- /dev/null
+++ b/workflows/tools/normalize/mapping.py
@@ -0,0 +1,43 @@
+# Normalization steps
+#
+# This library contains generic logic to normalize (string) data and
+# transform strings to URIs. It should be applicable to data from
+# any source (GenBank, ENA, etc.).
+#
+# Important: missing data should be missing or None! Do not fill
+# in data by 'guessing'.
+#
+# When data is malformed a warning should be logged and added to the
+# warning list. Functions should be small enough to return only 1
+# warning!
+#
+# Pjotr Prins (c) 2021
+
+import types
+
def host_species(host, mapping):
    """Replace a free-text ``host_species`` value with its mapped URI.

    Args:
        host: Dict describing the host. It is not modified; a shallow
            copy is returned instead.
        mapping: Dict-like lookup from free-text species name to URI.

    Returns:
        A ``(record, warning)`` tuple. ``record`` is the copied host dict,
        with ``host_species`` replaced when a mapping exists. ``warning``
        is ``None`` on success or a message when no mapping was found.

    A value that already contains ``'obolibrary'`` is treated as an
    existing URI and left untouched. A missing or ``None`` value is passed
    through unchanged with no warning: missing data stays missing and is
    never guessed.
    """
    record = dict(host)  # shallow copy so the caller's dict stays intact
    key = record.get("host_species")

    # Missing data is legitimate input: nothing to normalize, nothing to
    # warn about (previously this raised AttributeError/TypeError).
    if key is None:
        return record, None

    # Already an ontology URI; no lookup required.
    if "obolibrary" in key:
        return record, None

    if key in mapping:
        record["host_species"] = mapping[key]
        return record, None

    # Unmapped values are kept as-is so the caller can inspect them.
    return record, f"No URI mapping for host_species <{key}>"
+
def specimen_source(sample, mapping):
    """Replace a free-text ``specimen_source`` value with its mapped URI.

    Args:
        sample: Dict describing the sample. It is not modified; a shallow
            copy is returned instead.
        mapping: Dict-like lookup from free-text specimen source to URI.

    Returns:
        A ``(record, warning)`` tuple. ``record`` is the copied sample
        dict and ``warning`` is ``None`` or a message when a non-empty
        value had no mapping.

    Behaviour by input value:
        * contains ``'obolibrary'``: kept unchanged (already a URI);
        * found in ``mapping``: replaced by the mapped URI;
        * non-empty but unmapped: key removed and a warning returned;
        * empty/``None``: key removed, no warning;
        * key absent: record returned unchanged, no warning.
    """
    record = dict(sample)  # shallow copy so the caller's dict stays intact
    warning = None
    key = record.get("specimen_source")

    if key and "obolibrary" not in key:
        if key in mapping:
            record["specimen_source"] = mapping[key]
        else:
            # Do not keep unmapped free text; it is dropped below.
            record["specimen_source"] = None
            warning = f"No URI mapping for specimen_source <{key}>"

    # Missing data must be missing: drop empty values altogether.
    # pop() with a default also tolerates a record that never had the key
    # (previously this raised AttributeError).
    if not record.get("specimen_source"):
        record.pop("specimen_source", None)
    return record, warning