about | summary | refs | log | tree | commit | diff
path: root/workflows/pull-data/genbank
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-03 10:36:55 +0000
committer Pjotr Prins 2021-01-03 10:36:55 +0000
commit 0de0ae0ac62e85f1ba6587a252b5a1164cbd5210 (patch)
tree 79f9b1065ec8eb640eb50db9929a88aee2025daa /workflows/pull-data/genbank
parent a10c61d1f5afca70f54b762beefb402372d7fd31 (diff)
download bh20-seq-resource-0de0ae0ac62e85f1ba6587a252b5a1164cbd5210.tar.gz
bh20-seq-resource-0de0ae0ac62e85f1ba6587a252b5a1164cbd5210.tar.lz
bh20-seq-resource-0de0ae0ac62e85f1ba6587a252b5a1164cbd5210.zip
genbank: deal with host, sex and age
Diffstat (limited to 'workflows/pull-data/genbank')
-rw-r--r-- workflows/pull-data/genbank/genbank.py 48
-rw-r--r-- workflows/pull-data/genbank/ref.py 48
2 files changed, 46 insertions, 50 deletions
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 314f50d..90f5a14 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -70,6 +70,13 @@ def get_metadata(id, gbseq):
print(f"WARNING: {msg}",file=sys.stderr)
warnings.append(msg)
+ def fetch(msg, xpath):
+ try:
+ n = gbseq.find(xpath).text
+ return n
+ except AttributeError:
+ warn("Missing "+msg)
+
host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
sample.sample_id = id
sample.database = "https://www.ncbi.nlm.nih.gov/genbank/"
@@ -132,14 +139,47 @@ def get_metadata(id, gbseq):
warn("Missing collection_date")
sample.collection_date = None
+ # --- Host info
+ # - Homo sapiens
+ # - Homo sapiens; female
+ # - Homo sapiens; female 63
+ # - Homo sapiens; female; age 40
+ # - Homo sapiens; gender: F; age: 61
+ # - Homo sapiens; gender: M; age: 68
+ # - Homo sapiens; hospitalized patient
+ # - Homo sapiens; male
+ # - Homo sapiens; male; 63
+ # - Homo sapiens; male; age 29
+ # - Homo sapiens; symptomatic
+ n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value")
+ if n:
+ list = n.split('; ')
+ species = list[0]
+ host.host_species = species
+ if species != "Homo sapiens":
+ warn(f"Species not understood: {species}")
+ if len(list)>1:
+ sex = list[1]
+ if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male'
+ if 'female' in sex or 'gender: F' in sex: host.host_sex = 'female'
+ if len(list)>2:
+ age = list[2]
+ p = re.compile(r'[^\d]+(\d+)')
+ m = p.match(n)
+ print(m.group(1))
+ if m:
+ host.host_age = int(m.group(1))
+ host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036'
+ # sys.exit(1)
+
info = {
'id': 'placeholder',
'update_date': str(update_date),
- 'host': host,
- 'sample': sample,
+ 'host': host.__dict__,
+ 'sample': sample.__dict__,
#'virus': virus,
- 'technology': technology,
- 'submitter': submitter,
+ 'technology': technology.__dict__,
+ 'submitter': submitter.__dict__,
'warnings': warnings,
}
print(info)
diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py
index d809d7f..a9b4f06 100644
--- a/workflows/pull-data/genbank/ref.py
+++ b/workflows/pull-data/genbank/ref.py
@@ -26,53 +26,9 @@
technology[field_in_yaml] = tech_info_to_parse
- for GBFeature in GBSeq.iter('GBFeature'):
- if GBFeature.find('GBFeature_key').text != 'source':
- continue
-
- for GBQualifier in GBFeature.iter('GBQualifier'):
- GBQualifier_value = GBQualifier.find('GBQualifier_value')
- if GBQualifier_value is None:
- continue
- GBQualifier_value_text = GBQualifier_value.text
-
- GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text
-
- if GBQualifier_name_text == 'host':
- GBQualifier_value_text = GBQualifier_value_text.split(';')[0] # For case like Homo sapiens;sex:female
- if GBQualifier_value_text in field_to_term_to_uri_dict['ncbi_host_species']:
- # Cases like 'Felis catus; Domestic Shorthair'
- host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text]
- else:
- GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
- if GBQualifier_value_text_list[0] in field_to_term_to_uri_dict['ncbi_host_species']:
- host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species'][GBQualifier_value_text_list[0]]
- elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version):
- # Information checked manually from NCBI Virus
- host['host_species'] = field_to_term_to_uri_dict['ncbi_host_species']['Canis lupus familiaris']
- else:
- missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
-
- # Possible cases:
- # - Homo sapiens --> ['Homo sapiens']
- # - Homo sapiens; female --> ['Homo sapiens', 'female']
- # - Homo sapiens; female 63 --> ['Homo sapiens', 'female 63']
- # - Homo sapiens; female; age 40 --> ['Homo sapiens', 'female', 'age 40']
- # - Homo sapiens; gender: F; age: 61 --> ['Homo sapiens', 'gender: F', 'age: 61']
- # - Homo sapiens; gender: M; age: 68 --> ['Homo sapiens', 'gender: M', 'age: 68']
- # - Homo sapiens; hospitalized patient --> ['Homo sapiens', 'hospitalized patient']
- # - Homo sapiens; male --> ['Homo sapiens', 'male']
- # - Homo sapiens; male; 63 --> ['Homo sapiens', 'male', '63']
- # - Homo sapiens; male; age 29 --> ['Homo sapiens', 'male', 'age 29']
- # - Homo sapiens; symptomatic --> ['Homo sapiens', 'symptomatic']
- if len(GBQualifier_value_text_list) > 1:
- host_sex = ''
- if 'female' in GBQualifier_value_text_list[1]:
- host_sex = 'female'
- elif 'male' in GBQualifier_value_text_list[1]:
- host_sex = 'male'
- elif 'gender' in GBQualifier_value_text_list[1]:
+
+ elif 'gender' in GBQualifier_value_text_list[1]:
host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip()
if host_sex_one_lecter in ['F', 'M']:
host_sex = 'female' if host_sex_one_lecter == 'F' else 'male'