about summary refs log tree commit diff
path: root/workflows
diff options
context:
space:
mode:
author: Pjotr Prins, 2021-01-04 09:40:54 +0000
committer: Pjotr Prins, 2021-01-04 09:40:54 +0000
commit: 3593b3f6a835f6c5927cbb1cc79e3db3c5d0053a (patch)
tree: 6151f218cef5d4412118844ebce65d732f4b17b7 /workflows
parent: 1c4e055b8a9dc53b7fdbdf12d4b0a7e877fbc2ef (diff)
download: bh20-seq-resource-3593b3f6a835f6c5927cbb1cc79e3db3c5d0053a.tar.gz
          bh20-seq-resource-3593b3f6a835f6c5927cbb1cc79e3db3c5d0053a.tar.lz
          bh20-seq-resource-3593b3f6a835f6c5927cbb1cc79e3db3c5d0053a.zip
mapping sample_species using regex
Diffstat (limited to 'workflows')
-rw-r--r--  workflows/pull-data/genbank/ref.py      19
-rw-r--r--  workflows/tools/normalize/mapping.py    28
2 files changed, 24 insertions, 23 deletions
diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py
index 4d4df48..d2a377e 100644
--- a/workflows/pull-data/genbank/ref.py
+++ b/workflows/pull-data/genbank/ref.py
@@ -21,22 +21,3 @@ elif GBQualifier_name_text == 'collected_by':
 elif GBQualifier_name_text == 'isolation_source':
 if GBQualifier_value_text.upper() in field_to_term_to_uri_dict['ncbi_speciesman_source']:
     GBQualifier_value_text = GBQualifier_value_text.upper()  # For example, in case of 'usa: wa'
-
-# Little cleaning
-GBQualifier_value_text = GBQualifier_value_text.strip("/'")
-
-if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_speciesman_source']:
-    sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source'][GBQualifier_value_text]]
-else:
-    if GBQualifier_value_text.lower() in ['np/op', 'np-op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']:
-        sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']]
-    elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']:
-        sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
-    elif GBQualifier_value_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']:
-        sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal aspirate'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
-    elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
-        sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']]
-    elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
-        sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']]
-    else:
-        missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
index 1d52b03..d2af3b5 100644
--- a/workflows/tools/normalize/mapping.py
+++ b/workflows/tools/normalize/mapping.py
@@ -13,6 +13,7 @@
 #
 #   Pjotr Prins (c) 2021
 
+import re
 import types
 
 def host_species(host,mapping):
@@ -26,18 +27,37 @@ def host_species(host,mapping):
             warning = f"No URI mapping for host_species <{key}>"
     return host.__dict__,warning
 
+Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
+Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
+Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119"
+Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+
 def specimen_source(sample,mapping):
+    SPECIMEN_TERMS = {
+        r".*swab": Nasal_Swab,
+        "saliva": Saliva,
+        "seafood": Frozen_Food,
+        "packaging": Frozen_Food
+        }
     warning = None
     sample = types.SimpleNamespace(**sample)
+
     try:
-        if sample.specimen_source and not 'obolibrary' in sample.specimen_source:
+        if sample.specimen_source and \
+           not 'obolibrary' in sample.specimen_source and \
+           not 'wikidata' in sample.specimen_source:
             key = sample.specimen_source
+            sample.specimen_source = None
             if key in mapping:
                 sample.specimen_source = mapping[key]
             else:
-                sample.specimen_source = None
-                warning = f"No URI mapping for specimen_source <{key}>"
+                for term in SPECIMEN_TERMS:
+                    p = re.compile(term,re.IGNORECASE)
+                    m = p.match(key)
+                    if m: sample.specimen_source = SPECIMEN_TERMS[term]
+        if not sample.specimen_source:
+            warning = f"No URI mapping for specimen_source <{key}>"
+        if sample.specimen_source == None: del(sample.specimen_source)
     except AttributeError:
         pass
-    if not sample.specimen_source: del(sample.specimen_source)
     return sample.__dict__,warning