about summary refs log tree commit diff
diff options
context:
space:
mode:
author Pjotr Prins 2021-01-08 10:33:13 +0000
committer Pjotr Prins 2021-01-08 10:33:13 +0000
commit b112b3203e34ea61dfdf802bce5036f938eaa774 (patch)
tree da2ed8f18fcb7263fcec6f77f27ae40a51e6321d
parent 5d941e47e27a3cd2c47f8ee51f5523d180f8a5b2 (diff)
download bh20-seq-resource-b112b3203e34ea61dfdf802bce5036f938eaa774.tar.gz
bh20-seq-resource-b112b3203e34ea61dfdf802bce5036f938eaa774.tar.lz
bh20-seq-resource-b112b3203e34ea61dfdf802bce5036f938eaa774.zip
GenBank: Fix normalization and depth differences with original records
-rw-r--r-- workflows/pull-data/genbank/.guix-run 2
-rw-r--r-- workflows/pull-data/genbank/genbank.py 17
-rwxr-xr-x workflows/tools/normalize-yamlfa.py 6
-rw-r--r-- workflows/tools/normalize/mapping.py 59
4 files changed, 54 insertions, 30 deletions
diff --git a/workflows/pull-data/genbank/.guix-run b/workflows/pull-data/genbank/.guix-run
index 6db7871..f6b1a0c 100644
--- a/workflows/pull-data/genbank/.guix-run
+++ b/workflows/pull-data/genbank/.guix-run
@@ -4,5 +4,5 @@
echo # next run:
echo 'export PATH=$GUIX_ENVIRONMENT/bin:$PATH'
-~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby
+~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby jq
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 026c03f..a994055 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -90,10 +90,9 @@ def get_metadata(id, gbseq):
except AttributeError:
warn("Missing "+msg)
- host.host_species = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
sample.sample_id = id
sample.database = "https://www.ncbi.nlm.nih.gov/genbank/"
- sample.source_database_accession = f"http://identifiers.org/insdc/{id}#sequence"
+ sample.source_database_accession = [ f"http://identifiers.org/insdc/{id}#sequence" ]
# <GBQualifier_value>USA: Cruise_Ship_1, California</GBQualifier_value>
n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='country']/../GBQualifier_value")
if n: sample.collection_location = n
@@ -112,7 +111,7 @@ def get_metadata(id, gbseq):
if n != 'Unpublished':
institute,address = n.split(',',1)
if ")" in institute:
- submitter.submitter_name = institute.split(')')[1]
+ submitter.submitter_name = [institute.split(')')[1].strip()]
submitter.submitter_address = address.strip()
except AttributeError:
pass
@@ -129,13 +128,13 @@ def get_metadata(id, gbseq):
# technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
p = re.compile(r'.*Assembly Method :: ([^;]+).*')
m = p.match(n)
- if m: technology.alignment_protocol = m.group(1)
+ if m: technology.alignment_protocol = m.group(1).strip()
p = re.compile(r'.*Coverage :: ([^;]+).*')
m = p.match(n)
if m: technology.sequencing_coverage = m.group(1)
p = re.compile(r'.*Sequencing Technology :: ([^;]+).*')
m = p.match(n)
- if m: technology.sample_sequencing_technology = m.group(1).strip()
+ if m: technology.sample_sequencing_technology = [m.group(1).strip()]
else: warn("Missing sample_sequencing_technology")
# --- Dates
@@ -167,10 +166,7 @@ def get_metadata(id, gbseq):
n = fetch("host_species", ".//GBQualifier/GBQualifier_name/[.='host']/../GBQualifier_value")
if n:
list = n.split('; ')
- species = list[0]
- host.host_species = species
- if species != "Homo sapiens":
- warn(f"Species not understood: {species}")
+ host.host_species = list[0]
if len(list)>1:
sex = list[1]
if 'male' in sex or 'gender: M' in sex: host.host_sex = 'male'
@@ -183,13 +179,12 @@ def get_metadata(id, gbseq):
if m:
host.host_age = int(m.group(1))
host.host_age_unit = 'http://purl.obolibrary.org/obo/UO_0000036'
- # sys.exit(1)
n = fetch("virus_strain", ".//GBQualifier/GBQualifier_name/[.='isolate']/../GBQualifier_value")
if n: virus.virus_strain = n
n = fetch("virus_species", ".//GBQualifier/GBQualifier_name/[.='db_xref']/../GBQualifier_value")
if n: virus.virus_species = "http://purl.obolibrary.org/obo/NCBITaxon_"+n.split('taxon:')[1]
n = fetch("specimen_source", ".//GBQualifier/GBQualifier_name/[.='isolation_source']/../GBQualifier_value")
- if n: sample.specimen_source = n
+ if n: sample.specimen_source = [n]
info = {
'id': 'placeholder',
diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
index 20c2feb..55a8848 100755
--- a/workflows/tools/normalize-yamlfa.py
+++ b/workflows/tools/normalize-yamlfa.py
@@ -19,10 +19,10 @@ directory are parsed using the state.json file. It is possible
to select a subset of IDs.
This tool has two modes of operation. It can validate with the
-`--validate` switch which stops at a warning and does no rewriting.
+--validate switch which stops at a warning and does no rewriting.
This mode is typically used in troubleshooting.
-The other mode is `--rewrite` which rewrites the JSON files after
+The other mode is --rewrite which rewrites the JSON files after
making a backup (.bak) of the original. This mode updates files and
won't stop - it is used for (automated) uploads.
@@ -92,6 +92,6 @@ for id in ids:
os.rename(fn,fn+".bak")
with open(fn, 'w') as outfile:
print(f" Writing {fn}")
- json.dump(rec.__dict__, outfile, indent=4)
+ json.dump(rec.__dict__, outfile, indent=2)
else:
print(rec)
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
index bc82fea..3ed09c2 100644
--- a/workflows/tools/normalize/mapping.py
+++ b/workflows/tools/normalize/mapping.py
@@ -17,14 +17,36 @@ import re
import types
def host_species(host,mapping):
+ Homo_sapiens = "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+
+ SPECIES_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
+ "human": Homo_sapiens,
+ "sapiens": Homo_sapiens,
+ "Mustela lutreola": "http://purl.obolibrary.org/obo/NCBITaxon_9666",
+ "Manis javanica": "http://purl.obolibrary.org/obo/NCBITaxon_9974",
+ "Felis catus": "http://purl.obolibrary.org/obo/NCBITaxon_9685",
+ "Panthera tigris": "http://purl.obolibrary.org/obo/NCBITaxon_419130",
+ "Canis lupus": "http://purl.obolibrary.org/obo/NCBITaxon_9615",
+ # Mink:
+ "vison": "http://purl.obolibrary.org/obo/NCBITaxon_452646"
+ }
+
warning = None
host = types.SimpleNamespace(**host)
if not 'obolibrary' in host.host_species:
key = host.host_species
+ host.host_species = None
if key in mapping:
host.host_species = mapping[key]
else:
+ for term in SPECIES_TERMS:
+ p = re.compile(".*?"+term,re.IGNORECASE)
+ m = p.match(key)
+ if m: host.host_species = SPECIES_TERMS[term]
+ if not host.host_species:
warning = f"No URI mapping for host_species <{key}>"
+ if host.host_species == Unknown or host.host_species == None:
+ del(host.host_species)
return host.__dict__,warning
Unknown = "Not found" # So as not to create a warning
@@ -35,8 +57,10 @@ def specimen_source(sample,mapping):
Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
- Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119"
+ Nasal_Swab = Nasopharyngeal # "http://purl.obolibrary.org/obo/NCIT_C132119"
Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+ Bronchoalveolar_Lavage = "http://purl.obolibrary.org/obo/NCIT_C13195",
+ Biospecimen = "http://purl.obolibrary.org/obo/NCIT_C70699"
SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
"Oronasopharynx": Oronasopharynx,
"orophar": Oropharyngeal,
@@ -44,28 +68,33 @@ def specimen_source(sample,mapping):
"\snares": Nasal_Swab,
"saliva": Saliva,
"swab": Nasal_Swab,
+ "broncho": Bronchoalveolar_Lavage,
"seafood": Frozen_Food,
"packaging": Frozen_Food,
+ "specimen": Biospecimen,
+ "patient": Biospecimen,
"uknown": Unknown,
"unknown": Unknown
}
warning = None
sample = types.SimpleNamespace(**sample)
try:
- if sample.specimen_source and \
- not 'obolibrary' in sample.specimen_source and \
- not 'wikidata' in sample.specimen_source:
- key = sample.specimen_source
- sample.specimen_source = None
- if key in mapping:
- sample.specimen_source = mapping[key]
- else:
- for term in SPECIMEN_TERMS:
- p = re.compile(".*?"+term,re.IGNORECASE)
- m = p.match(key)
- if m: sample.specimen_source = SPECIMEN_TERMS[term]
- if not sample.specimen_source:
- warning = f"No URI mapping for specimen_source <{key}>"
+ if sample.specimen_source:
+ keys = sample.specimen_source
+ sample.specimen_source = []
+ for key in keys:
+ if 'obolibrary' in key:
+ sample.specimen_source.append(key)
+ continue
+ if key in mapping:
+ sample.specimen_source.append(mapping[key])
+ else:
+ for term in SPECIMEN_TERMS:
+ p = re.compile(".*?"+term,re.IGNORECASE)
+ m = p.match(key)
+ if m: sample.specimen_source = [SPECIMEN_TERMS[term]]
+ if len(sample.specimen_source)==0:
+ warning = f"No URI mapping for specimen_source <{key}>"
if sample.specimen_source == Unknown or sample.specimen_source == None:
del(sample.specimen_source)
except AttributeError: