aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPjotr Prins2021-01-04 10:25:36 +0000
committerPjotr Prins2021-01-04 10:25:36 +0000
commitf9f27a787fef0ad58c1ae465d8ba1ee4634083ae (patch)
tree64bba4b6c496c9aa63aeb651bcc8252256a905bf
parentbf8f13af6f083d382b4a3900566ef5e329084cbf (diff)
downloadbh20-seq-resource-f9f27a787fef0ad58c1ae465d8ba1ee4634083ae.tar.gz
bh20-seq-resource-f9f27a787fef0ad58c1ae465d8ba1ee4634083ae.tar.lz
bh20-seq-resource-f9f27a787fef0ad58c1ae465d8ba1ee4634083ae.zip
mapping: no longer requires specimen file for genbank output
-rw-r--r--workflows/pull-data/genbank/README.md10
-rwxr-xr-xworkflows/tools/normalize-yamlfa.py4
-rw-r--r--workflows/tools/normalize/mapping.py30
3 files changed, 29 insertions, 15 deletions
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index d7cc15f..5464d1d 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -13,13 +13,17 @@ The following workflow sends GenBank data into PubSeq
# --- get list of IDs already in PubSeq
../../tools/sparql-fetch-ids > pubseq_ids.txt
# --- get list of missing genbank IDs
-./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
+python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
+
# --- fetch XML
python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank
+
# --- Transform to YAML/JSON and FASTA
python3 transform-genbank-xml2yamlfa.py --out ~/tmp/pubseq file(s)
-# --- Normalize data
-../../tools/normalize-yamlfa.py --in ~/tmp/pubseq/state.json file(s)
+
+# --- Normalize data (validation mode)
+python3 ../../tools/normalize-yamlfa.py -s ~/tmp/pubseq/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
+
```
# TODO
diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py
index e3f92c0..20c2feb 100755
--- a/workflows/tools/normalize-yamlfa.py
+++ b/workflows/tools/normalize-yamlfa.py
@@ -3,7 +3,7 @@
#
# Example:
#
-# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json MW241349 --species ./scripts/dict_ontology_standardization/ncbi_host_species.csv
+# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate
import argparse
import json
@@ -30,7 +30,7 @@ won't stop - it is used for (automated) uploads.
parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True)
parser.add_argument('--species', type=str, help='Species mapping file')
-parser.add_argument('--specimen', type=str, help='Specimen mapping file')
+parser.add_argument('--specimen', type=str, help='Optional specimen mapping file')
parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning')
parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files')
parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON')
diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py
index d2af3b5..bc82fea 100644
--- a/workflows/tools/normalize/mapping.py
+++ b/workflows/tools/normalize/mapping.py
@@ -27,21 +27,30 @@ def host_species(host,mapping):
warning = f"No URI mapping for host_species <{key}>"
return host.__dict__,warning
-Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
-Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
-Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119"
-Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+Unknown = "Not found" # So as not to create a warning
def specimen_source(sample,mapping):
- SPECIMEN_TERMS = {
- r".*swab": Nasal_Swab,
+ Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835"
+ Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835"
+ Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831"
+ Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195"
+ Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275"
+ Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119"
+ Frozen_Food = "https://www.wikidata.org/wiki/Q751728"
+ SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! Note that re is allowed
+ "Oronasopharynx": Oronasopharynx,
+ "orophar": Oropharyngeal,
+ "pharyngeal": Nasopharyngeal,
+ "\snares": Nasal_Swab,
"saliva": Saliva,
+ "swab": Nasal_Swab,
"seafood": Frozen_Food,
- "packaging": Frozen_Food
+ "packaging": Frozen_Food,
+ "uknown": Unknown,
+ "unknown": Unknown
}
warning = None
sample = types.SimpleNamespace(**sample)
-
try:
if sample.specimen_source and \
not 'obolibrary' in sample.specimen_source and \
@@ -52,12 +61,13 @@ def specimen_source(sample,mapping):
sample.specimen_source = mapping[key]
else:
for term in SPECIMEN_TERMS:
- p = re.compile(term,re.IGNORECASE)
+ p = re.compile(".*?"+term,re.IGNORECASE)
m = p.match(key)
if m: sample.specimen_source = SPECIMEN_TERMS[term]
if not sample.specimen_source:
warning = f"No URI mapping for specimen_source <{key}>"
- if sample.specimen_source == None: del(sample.specimen_source)
+ if sample.specimen_source == Unknown or sample.specimen_source == None:
+ del(sample.specimen_source)
except AttributeError:
pass
return sample.__dict__,warning