From 0de0ae0ac62e85f1ba6587a252b5a1164cbd5210 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 3 Jan 2021 10:36:55 +0000 Subject: genbank: deal with host, sex and age --- workflows/pull-data/genbank/genbank.py | 48 +++++++++++++++++++++++++++++++--- workflows/pull-data/genbank/ref.py | 48 ++-------------------------------- 2 files changed, 46 insertions(+), 50 deletions(-) diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 314f50d..90f5a14 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -70,6 +70,13 @@ def get_metadata(id, gbseq): print(f"WARNING: {msg}",file=sys.stderr) warnings.append(msg) + def fetch(msg, xpath): + try: + n = gbseq.find(xpath).text + return n + except AttributeError: + warn("Missing "+msg) + host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606" sample.sample_id = id sample.database = "https://www.ncbi.nlm.nih.gov/genbank/" @@ -132,14 +139,47 @@ def get_metadata(id, gbseq): warn("Missing collection_date") sample.collection_date = None + # --- Host info + # - Homo sapiens + # - Homo sapiens; female + # - Homo sapiens; female 63 + # - Homo sapiens; female; age 40 + # - Homo sapiens; gender: F; age: 61 + # - Homo sapiens; gender: M; age: 68 + # - Homo sapiens; hospitalized patient + # - Homo sapiens; male + # - Homo sapiens; male; 63 + # - Homo sapiens; male; age 29 + # - Homo sapiens; symptomatic + n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value") + if n: + list = n.split('; ') + species = list[0] + host.host_species = species + if species != "Homo sapiens": + warn(f"Species not understood: {species}") + if len(list)>1: + sex = list[1] + if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male' + if 'female' in sex or 'gender: F' in sex: host.host_sex = 'female' + if len(list)>2: + age = list[2] + p = re.compile(r'[^\d]+(\d+)') + m = p.match(n) + print(m.group(1)) + if m: + host.host_age = int(m.group(1)) + host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036' + # sys.exit(1) + info = { 'id': 'placeholder', 'update_date': str(update_date), - 'host': host, - 'sample': sample, + 'host': host.__dict__, + 'sample': sample.__dict__, #'virus': virus, - 'technology': technology, - 'submitter': submitter, + 'technology': technology.__dict__, + 'submitter': submitter.__dict__, 'warnings': warnings, } print(info) diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py index d809d7f..a9b4f06 100644 --- a/workflows/pull-data/genbank/ref.py +++ b/workflows/pull-data/genbank/ref.py @@ -26,53 +26,9 @@ technology[field_in_yaml] = tech_info_to_parse - for GBFeature in GBSeq.iter('GBFeature'): - if GBFeature.find('GBFeature_key').text != 'source': - continue - - for GBQualifier in GBFeature.iter('GBQualifier'): - GBQualifier_value = GBQualifier.find('GBQualifier_value') - if GBQualifier_value is None: - continue - GBQualifier_value_text = GBQualifier_value.text - - GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text - - if GBQualifier_name_text == 'host': - GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female - if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_host_species']: - # Cases like 'Felis catus; Domestic Shorthair' - host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text] - else: - GBQualifier_value_text_list = GBQualifier_value_text.split('; ') - if GBQualifier_value_text_list[0] in field_to_term_to_uri_dict['ncbi_host_species']: - host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text_list[0]] - elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version): - # Information checked manually from NCBI Virus - host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species']['Canis lupus familiaris'] - else: - missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]])) - - # Possible cases: - # - Homo sapiens --> ['Homo sapiens'] - # - Homo sapiens; female --> ['Homo sapiens', 'female'] - # - Homo sapiens; female 63 --> ['Homo sapiens', 'female 63'] - # - Homo sapiens; female; age 40 --> ['Homo sapiens', 'female', 'age 40'] - # - Homo sapiens; gender: F; age: 61 --> ['Homo sapiens', 'gender: F', 'age: 61'] - # - Homo sapiens; gender: M; age: 68 --> ['Homo sapiens', 'gender: M', 'age: 68'] - # - Homo sapiens; hospitalized patient --> ['Homo sapiens', 'hospitalized patient'] - # - Homo sapiens; male --> ['Homo sapiens', 'male'] - # - Homo sapiens; male; 63 --> ['Homo sapiens', 'male', '63'] - # - Homo sapiens; male; age 29 --> ['Homo sapiens', 'male', 'age 29'] - # - Homo sapiens; symptomatic --> ['Homo sapiens', 'symptomatic'] - if len(GBQualifier_value_text_list) > 1: - host_sex = '' - if 'female' in GBQualifier_value_text_list[1]: - host_sex = 'female' - elif 'male' in GBQualifier_value_text_list[1]: - host_sex = 'male' - elif 'gender' in GBQualifier_value_text_list[1]: + + elif 'gender' in GBQualifier_value_text_list[1]: host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip() if host_sex_one_lecter in ['F', 'M']: host_sex = 'female' if host_sex_one_lecter == 'F' else 'male' -- cgit v1.2.3