aboutsummaryrefslogtreecommitdiff
path: root/workflows/tools/normalize
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-08 10:33:13 +0000
committer Pjotr Prins 2021-01-08 10:33:13 +0000
commit b112b3203e34ea61dfdf802bce5036f938eaa774 (patch)
tree da2ed8f18fcb7263fcec6f77f27ae40a51e6321d /workflows/tools/normalize
parent 5d941e47e27a3cd2c47f8ee51f5523d180f8a5b2 (diff)
download bh20-seq-resource-b112b3203e34ea61dfdf802bce5036f938eaa774.tar.gz
bh20-seq-resource-b112b3203e34ea61dfdf802bce5036f938eaa774.tar.lz
bh20-seq-resource-b112b3203e34ea61dfdf802bce5036f938eaa774.zip
GenBank: Fix normalization and depth differences with original records
Diffstat (limited to 'workflows/tools/normalize')
-rw-r--r-- workflows/tools/normalize/mapping.py 59
1 files changed, 44 insertions, 15 deletions
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
index bc82fea..3ed09c2 100644
--- a/workflows/tools/normalize/mapping.py
+++ b/workflows/tools/normalize/mapping.py
@@ -17,14 +17,36 @@ import re
import types
def host_species(host, mapping):
    """Normalize the ``host_species`` field of a host record to an ontology URI.

    Args:
        host: dict describing the host. It is not modified; a shallow copy
            is returned.
        mapping: dict from free-text species names to URIs, consulted for an
            exact match before the built-in regex terms are tried.

    Returns:
        A ``(record, warning)`` tuple. ``record`` is the copy of ``host`` with
        ``host_species`` replaced by a URI, or with the key removed when no
        URI could be found or the value resolved to ``Unknown``. ``warning``
        is a message when no mapping was found, otherwise ``None``.
    """
    Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"

    # Keys are regular expressions matched case-insensitively anywhere in the
    # value. Every term is tried in insertion order and there is no early
    # exit, so when several terms match the LAST one wins.
    SPECIES_TERMS = {
        "human": Homo_sapiens,
        "sapiens": Homo_sapiens,
        "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
        "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
        "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
        "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
        "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
        # Mink:
        "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646",
    }

    warning = None
    record = dict(host)  # never mutate the caller's dict
    key = record.get("host_species")

    # Nothing to normalize: a missing or None value used to raise
    # AttributeError/TypeError. Drop a None placeholder and return quietly.
    if key is None:
        record.pop("host_species", None)
        return record, warning

    # Already an OBO URI: pass through untouched.
    if 'obolibrary' in key:
        return record, warning

    uri = None
    if key in mapping:
        uri = mapping[key]
    else:
        for term, term_uri in SPECIES_TERMS.items():
            if re.search(term, key, re.IGNORECASE):
                uri = term_uri

    if not uri:
        warning = f"No URI mapping for host_species <{key}>"

    # `Unknown` is the module-level sentinel: a value mapped to it is dropped
    # from the record without producing a warning.
    if uri is None or uri == Unknown:
        del record["host_species"]
    else:
        record["host_species"] = uri
    return record, warning


Unknown = "Not found" # So as not to create a warning
@@ -35,8 +57,10 @@ def specimen_source(sample,mapping):
Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
- Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119"
+ Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+ Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195",
+ Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
"Oronasopharynx": Oronasopharynx,
"orophar": Oropharyngeal,
@@ -44,28 +68,33 @@ def specimen_source(sample,mapping):
"\snares": Nasal_Swab,
"saliva": Saliva,
"swab": Nasal_Swab,
+ "broncho": Bronchoalveolar_Lavage,
"seafood": Frozen_Food,
"packaging": Frozen_Food,
+ "specimen": Biospecimen,
+ "patient": Biospecimen,
"uknown": Unknown,
"unknown": Unknown
}
warning = None
sample = types.SimpleNamespace(**sample)
try:
- if sample.specimen_source and \
- not 'obolibrary' in sample.specimen_source and \
- not 'wikidata' in sample.specimen_source:
- key = sample.specimen_source
- sample.specimen_source = None
- if key in mapping:
- sample.specimen_source = mapping[key]
- else:
- for term in SPECIMEN_TERMS:
- p = re.compile(".*?"+term,re.IGNORECASE)
- m = p.match(key)
- if m: sample.specimen_source = SPECIMEN_TERMS[term]
- if not sample.specimen_source:
- warning = f"No URI mapping for specimen_source <{key}>"
+ if sample.specimen_source:
+ keys = sample.specimen_source
+ sample.specimen_source = []
+ for key in keys:
+ if 'obolibrary' in key:
+ sample.specimen_source.append(key)
+ continue
+ if key in mapping:
+ sample.specimen_source.append(mapping[key])
+ else:
+ for term in SPECIMEN_TERMS:
+ p = re.compile(".*?"+term,re.IGNORECASE)
+ m = p.match(key)
+ if m: sample.specimen_source = [SPECIMEN_TERMS[term]]
+ if len(sample.specimen_source)==0:
+ warning = f"No URI mapping for specimen_source <{key}>"
if sample.specimen_source == Unknown or sample.specimen_source == None:
del(sample.specimen_source)
except AttributeError: