From 3593b3f6a835f6c5927cbb1cc79e3db3c5d0053a Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Mon, 4 Jan 2021 09:40:54 +0000 Subject: mapping sample_species using regex --- workflows/pull-data/genbank/ref.py | 19 ------------------- workflows/tools/normalize/mapping.py | 28 ++++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py index 4d4df48..d2a377e 100644 --- a/workflows/pull-data/genbank/ref.py +++ b/workflows/pull-data/genbank/ref.py @@ -21,22 +21,3 @@ elif GBQualifier_name_text == 'collected_by': elif GBQualifier_name_text == 'isolation_source': if GBQualifier_value_text.upper() in field_to_term_to_uri_dict['ncbi_speciesman_source']: GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa' - -# Little cleaning -GBQualifier_value_text = GBQualifier_value_text.strip("/'") - -if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_speciesman_source']: - sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source'][GBQualifier_value_text]] -else: - if GBQualifier_value_text.lower() in ['np/op', 'np-op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab', 'naso and/or oropharyngeal swab']: - sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']] - elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab', 'Nasopharyngeal/Throat']: - sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] - elif GBQualifier_value_text.lower() in ['nasopharyngeal aspirate & throat swab', 'nasopharyngeal aspirate and throat swab']: - sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasopharyngeal aspirate'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] - elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']: - sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['throat swab']] - elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']: - sample['specimen_source'] = [field_to_term_to_uri_dict['ncbi_speciesman_source']['nasal swab'], field_to_term_to_uri_dict['ncbi_speciesman_source']['oropharyngeal swab']] - else: - missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text])) diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py index 1d52b03..d2af3b5 100644 --- a/workflows/tools/normalize/mapping.py +++ b/workflows/tools/normalize/mapping.py @@ -13,6 +13,7 @@ # # Pjotr Prins (c) 2021 +import re import types def host_species(host,mapping): @@ -26,18 +27,37 @@ def host_species(host,mapping): warning = f"No URI mapping for host_species <{key}>" return host.__dict__,warning +Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195" +Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275" +Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119" +Frozen_Food = "https://www.wikidata.org/wiki/Q751728" + def specimen_source(sample,mapping): + SPECIMEN_TERMS = { + r".*swab": Nasal_Swab, + "saliva": Saliva, + "seafood": Frozen_Food, + "packaging": Frozen_Food + } warning = None sample = types.SimpleNamespace(**sample) + try: - if sample.specimen_source and not 'obolibrary' in sample.specimen_source: + if sample.specimen_source and \ + not 'obolibrary' in sample.specimen_source and \ + not 'wikidata' in sample.specimen_source: key = sample.specimen_source + sample.specimen_source = None if key in mapping: sample.specimen_source = mapping[key] else: - sample.specimen_source = None - warning = f"No URI mapping for specimen_source <{key}>" + for term in SPECIMEN_TERMS: + p = re.compile(term,re.IGNORECASE) + m = p.match(key) + if m: sample.specimen_source = SPECIMEN_TERMS[term] + if not sample.specimen_source: + warning = f"No URI mapping for specimen_source <{key}>" + if sample.specimen_source == None: del(sample.specimen_source) except AttributeError: pass - if not sample.specimen_source: del(sample.specimen_source) return sample.__dict__,warning -- cgit v1.2.3