From b112b3203e34ea61dfdf802bce5036f938eaa774 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Fri, 8 Jan 2021 10:33:13 +0000 Subject: GenBank: Fix normalization and depth differences with original records --- workflows/pull-data/genbank/.guix-run | 2 +- workflows/pull-data/genbank/genbank.py | 17 ++++------ workflows/tools/normalize-yamlfa.py | 6 ++-- workflows/tools/normalize/mapping.py | 59 +++++++++++++++++++++++++--------- 4 files changed, 54 insertions(+), 30 deletions(-) diff --git a/workflows/pull-data/genbank/.guix-run b/workflows/pull-data/genbank/.guix-run index 6db7871..f6b1a0c 100644 --- a/workflows/pull-data/genbank/.guix-run +++ b/workflows/pull-data/genbank/.guix-run @@ -4,5 +4,5 @@ echo # next run: echo 'export PATH=$GUIX_ENVIRONMENT/bin:$PATH' -~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby +~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby jq diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 026c03f..a994055 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -90,10 +90,9 @@ def get_metadata(id, gbseq): except AttributeError: warn("Missing "+msg) - host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606" sample.sample_id = id sample.database = "https://www.ncbi.nlm.nih.gov/genbank/" - sample.source_database_accession = f"http://identifiers.org/insdc/{id}#sequence" + sample.source_database_accession = [ f"http://identifiers.org/insdc/{id}#sequence" ] # USA: Cruise_Ship_1, California n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='country']/../GBQualifier_value") if n: sample.collection_location = n @@ -112,7 +111,7 @@ def get_metadata(id, gbseq): if n != 'Unpublished': institute,address = n.split(',',1) if ")" in institute: - submitter.submitter_name = institute.split(')')[1] + submitter.submitter_name = [institute.split(')')[1].strip()] submitter.submitter_address = address.strip() except AttributeError: pass @@ -129,13 +128,13 @@ def get_metadata(id, gbseq): # technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628' p = re.compile(r'.*Assembly Method :: ([^;]+).*') m = p.match(n) - if m: technology.alignment_protocol = m.group(1) + if m: technology.alignment_protocol = m.group(1).strip() p = re.compile(r'.*Coverage :: ([^;]+).*') m = p.match(n) if m: technology.sequencing_coverage = m.group(1) p = re.compile(r'.*Sequencing Technology :: ([^;]+).*') m = p.match(n) - if m: technology.sample_sequencing_technology = m.group(1).strip() + if m: technology.sample_sequencing_technology = [m.group(1).strip()] else: warn("Missing sample_sequencing_technology") # --- Dates @@ -167,10 +166,7 @@ def get_metadata(id, gbseq): n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value") if n: list = n.split('; ') - species = list[0] - host.host_species = species - if species != "Homo sapiens": - warn(f"Species not understood: {species}") + host.host_species = list[0] if len(list)>1: sex = list[1] if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male' @@ -183,13 +179,12 @@ def get_metadata(id, gbseq): if m: host.host_age = int(m.group(1)) host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036' - # sys.exit(1) n = fetch("virus_strain", ".//GBQualifier/GBQualifier_name/[.='isolate']/../GBQualifier_value") if n: virus.virus_strain = n n = fetch("virus_species", ".//GBQualifier/GBQualifier_name/[.='db_xref']/../GBQualifier_value") if n: virus.virus_species = "http://purl.obolibrary.org/obo/NCBITaxon_"+n.split('taxon:')[1] n = fetch("specimen_source", ".//GBQualifier/GBQualifier_name/[.='isolation_source']/../GBQualifier_value") - if n: sample.specimen_source = n + if n: sample.specimen_source = [n] info = { 'id': 'placeholder', diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py index 20c2feb..55a8848 100755 --- a/workflows/tools/normalize-yamlfa.py +++ b/workflows/tools/normalize-yamlfa.py @@ -19,10 +19,10 @@ directory are parsed using the state.json file. It is possible to select a subset of IDs. This tool has two modes of operation. It can validate with the -`--validate` switch which stops at a warning and does no rewriting. +--validate switch which stops at a warning and does no rewriting. This mode is typically used in troubleshooting. -The other mode is `--rewrite` which rewrites the JSON files after +The other mode is --rewrite which rewrites the JSON files after making a backup (.bak) of the original. This mode updates files and won't stop - it is used for (automated) uploads. @@ -92,6 +92,6 @@ for id in ids: os.rename(fn,fn+".bak") with open(fn, 'w') as outfile: print(f" Writing {fn}") - json.dump(rec.__dict__, outfile, indent=4) + json.dump(rec.__dict__, outfile, indent=2) else: print(rec) diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py index bc82fea..3ed09c2 100644 --- a/workflows/tools/normalize/mapping.py +++ b/workflows/tools/normalize/mapping.py @@ -17,14 +17,36 @@ import re import types def host_species(host,mapping): + Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606" + + SPECIES_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed + "human": Homo_sapiens, + "sapiens": Homo_sapiens, + "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666", + "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974", + "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685", + "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130", + "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615", + # Mink: + "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646" + } + warning = None host = types.SimpleNamespace(**host) if not 'obolibrary' in host.host_species: key = host.host_species + host.host_species = None if key in mapping: host.host_species = mapping[key] else: + for term in SPECIES_TERMS: + p = re.compile(".*?"+term,re.IGNORECASE) + m = p.match(key) + if m: host.host_species = SPECIES_TERMS[term] + if not host.host_species: warning = f"No URI mapping for host_species <{key}>" + if host.host_species == Unknown or host.host_species == None: + del(host.host_species) return host.__dict__,warning Unknown = "Not found" # So as not to create a warning @@ -35,8 +57,10 @@ def specimen_source(sample,mapping): Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831" Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195" Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275" - Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119" + Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119" Frozen_Food = "https://www.wikidata.org/wiki/Q751728" + Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195", + Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699" SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed "Oronasopharynx": Oronasopharynx, "orophar": Oropharyngeal, @@ -44,28 +68,33 @@ def specimen_source(sample,mapping): "\snares": Nasal_Swab, "saliva": Saliva, "swab": Nasal_Swab, + "broncho": Bronchoalveolar_Lavage, "seafood": Frozen_Food, "packaging": Frozen_Food, + "specimen": Biospecimen, + "patient": Biospecimen, "uknown": Unknown, "unknown": Unknown } warning = None sample = types.SimpleNamespace(**sample) try: - if sample.specimen_source and \ - not 'obolibrary' in sample.specimen_source and \ - not 'wikidata' in sample.specimen_source: - key = sample.specimen_source - sample.specimen_source = None - if key in mapping: - sample.specimen_source = mapping[key] - else: - for term in SPECIMEN_TERMS: - p = re.compile(".*?"+term,re.IGNORECASE) - m = p.match(key) - if m: sample.specimen_source = SPECIMEN_TERMS[term] - if not sample.specimen_source: - warning = f"No URI mapping for specimen_source <{key}>" + if sample.specimen_source: + keys = sample.specimen_source + sample.specimen_source = [] + for key in keys: + if 'obolibrary' in key: + sample.specimen_source.append(key) + continue + if key in mapping: + sample.specimen_source.append(mapping[key]) + else: + for term in SPECIMEN_TERMS: + p = re.compile(".*?"+term,re.IGNORECASE) + m = p.match(key) + if m: sample.specimen_source = [SPECIMEN_TERMS[term]] + if len(sample.specimen_source)==0: + warning = f"No URI mapping for specimen_source <{key}>" if sample.specimen_source == Unknown or sample.specimen_source == None: del(sample.specimen_source) except AttributeError: -- cgit v1.2.3