about | summary | refs | log | tree | commit | diff
path: root/workflows/tools/normalize
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-28 18:45:52 +0000
committer Pjotr Prins 2021-01-28 18:45:52 +0000
commit 8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3 (patch)
tree 3d17dd32522df3cfa808e8df6ebf722a70cc01d3 /workflows/tools/normalize
parent 90470bc795a17a6ddf6dca156f507d02cb056ec3 (diff)
download bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.tar.gz
bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.tar.lz
bh20-seq-resource-8a7e79d6daa06da4d8ca2a391bae0a00124a2ed3.zip
Moving tools out of submodules (sorry!)
Diffstat (limited to 'workflows/tools/normalize')
-rw-r--r-- workflows/tools/normalize/README.md 14
-rw-r--r-- workflows/tools/normalize/__init__.py 0
-rw-r--r-- workflows/tools/normalize/mapping.py 102
3 files changed, 0 insertions, 116 deletions
diff --git a/workflows/tools/normalize/README.md b/workflows/tools/normalize/README.md
deleted file mode 100644
index b780a68..0000000
--- a/workflows/tools/normalize/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Normalization steps
-
-This library contains generic logic to normalize (string) data and
-transforms strings to URIs. It should be applicable to data from
-any source (GenBank, ENA etc).
-
-Important: missing data should be missing or None! Do not fill
-in data by 'guessing'.
-
-When data is malformed a warning should be logged and added to the
-warning list. Functions should be small enough to return only 1
-warning!
-
-Pjotr Prins (c) 2021
diff --git a/workflows/tools/normalize/__init__.py b/workflows/tools/normalize/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/workflows/tools/normalize/__init__.py
+++ /dev/null
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
deleted file mode 100644
index 3ed09c2..0000000
--- a/workflows/tools/normalize/mapping.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Normalization steps
-#
-# This library contains generic logic to normalize (string) data and
-# transforms strings to URIs. It should be applicable to data from
-# any source (GenBank, ENA etc).
-#
-# Important: missing data should be missing or None! Do not fill
-# in data by 'guessing'.
-#
-# When data is malformed a warning should be logged and added to the
-# warning list. Functions should be small enough to return only 1
-# warning!
-#
-# Pjotr Prins (c) 2021
-
-import re
-import types
-
-def host_species(host,mapping):
- Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
-
- SPECIES_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
- "human": Homo_sapiens,
- "sapiens": Homo_sapiens,
- "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
- "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
- "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
- "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
- "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
- # Mink:
- "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
- }
-
- warning = None
- host = types.SimpleNamespace(**host)
- if not 'obolibrary' in host.host_species:
- key = host.host_species
- host.host_species = None
- if key in mapping:
- host.host_species = mapping[key]
- else:
- for term in SPECIES_TERMS:
- p = re.compile(".*?"+term,re.IGNORECASE)
- m = p.match(key)
- if m: host.host_species = SPECIES_TERMS[term]
- if not host.host_species:
- warning = f"No URI mapping for host_species <{key}>"
- if host.host_species == Unknown or host.host_species == None:
- del(host.host_species)
- return host.__dict__,warning
-
-Unknown = "Not found" # So as not to create a warning
-
-def specimen_source(sample,mapping):
- Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
- Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
- Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
- Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
- Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
- Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
- Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
- Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195",
- Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
- SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
- "Oronasopharynx": Oronasopharynx,
- "orophar": Oropharyngeal,
- "pharyngeal": Nasopharyngeal,
- "\snares": Nasal_Swab,
- "saliva": Saliva,
- "swab": Nasal_Swab,
- "broncho": Bronchoalveolar_Lavage,
- "seafood": Frozen_Food,
- "packaging": Frozen_Food,
- "specimen": Biospecimen,
- "patient": Biospecimen,
- "uknown": Unknown,
- "unknown": Unknown
- }
- warning = None
- sample = types.SimpleNamespace(**sample)
- try:
- if sample.specimen_source:
- keys = sample.specimen_source
- sample.specimen_source = []
- for key in keys:
- if 'obolibrary' in key:
- sample.specimen_source.append(key)
- continue
- if key in mapping:
- sample.specimen_source.append(mapping[key])
- else:
- for term in SPECIMEN_TERMS:
- p = re.compile(".*?"+term,re.IGNORECASE)
- m = p.match(key)
- if m: sample.specimen_source = [SPECIMEN_TERMS[term]]
- if len(sample.specimen_source)==0:
- warning = f"No URI mapping for specimen_source <{key}>"
- if sample.specimen_source == Unknown or sample.specimen_source == None:
- del(sample.specimen_source)
- except AttributeError:
- pass
- return sample.__dict__,warning