diff options
-rw-r--r-- | workflows/pull-data/genbank/README.md | 10 | ||||
-rwxr-xr-x | workflows/tools/normalize-yamlfa.py | 4 | ||||
-rw-r--r-- | workflows/tools/normalize/mapping.py | 30 |
3 files changed, 29 insertions, 15 deletions
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md index d7cc15f..5464d1d 100644 --- a/workflows/pull-data/genbank/README.md +++ b/workflows/pull-data/genbank/README.md @@ -13,13 +13,17 @@ The following workflow sends GenBank data into PubSeq # --- get list of IDs already in PubSeq ../../tools/sparql-fetch-ids > pubseq_ids.txt # --- get list of missing genbank IDs -./genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt +python3 genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt + # --- fetch XML python3 update-from-genbank.py --ids genbank_ids.txt --out ~/tmp/genbank + # --- Transform to YAML/JSON and FASTA python3 transform-genbank-xml2yamlfa.py --out ~/tmp/pubseq file(s) -# --- Normalize data -../../tools/normalize-yamlfa.py --in ~/tmp/pubseq/state.json file(s) + +# --- Normalize data (validation mode) +python3 ../../tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate + ``` # TODO diff --git a/workflows/tools/normalize-yamlfa.py b/workflows/tools/normalize-yamlfa.py index e3f92c0..20c2feb 100755 --- a/workflows/tools/normalize-yamlfa.py +++ b/workflows/tools/normalize-yamlfa.py @@ -3,7 +3,7 @@ # # Example: # -# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json MW241349 --species ./scripts/dict_ontology_standardization/ncbi_host_species.csv +# python3 ./workflows/tools/normalize-yamlfa.py -s ~/tmp/yamlfa/state.json --species ncbi_host_species.csv --specimen specimen.csv --validate import argparse import json @@ -30,7 +30,7 @@ won't stop - it is used for (automated) uploads. 
parser.add_argument('-s','--state', type=str, help='State file (JSON) as produced by transform2yamlfa', required=True) parser.add_argument('--species', type=str, help='Species mapping file') -parser.add_argument('--specimen', type=str, help='Specimen mapping file') +parser.add_argument('--specimen', type=str, help='Optional specimen mapping file') parser.add_argument('--validate', action='store_true', help='Validation mode - stops on warning') parser.add_argument('--rewrite', action='store_true', help='Rewrite mode - updates files') parser.add_argument('--yaml', action='store_true', help='Input YAML instead of JSON') diff --git a/workflows/tools/normalize/mapping.py b/workflows/tools/normalize/mapping.py index d2af3b5..bc82fea 100644 --- a/workflows/tools/normalize/mapping.py +++ b/workflows/tools/normalize/mapping.py @@ -27,21 +27,30 @@ def host_species(host,mapping): warning = f"No URI mapping for host_species <{key}>" return host.__dict__,warning -Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195" -Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275" -Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119" -Frozen_Food = "https://www.wikidata.org/wiki/Q751728" +Unknown = "Not found" # So as not to create a warning def specimen_source(sample,mapping): - SPECIMEN_TERMS = { - r".*swab": Nasal_Swab, + Oronasopharynx = "http://purl.obolibrary.org/obo/NCIT_C155835" + Oropharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155835" + Nasopharyngeal = "http://purl.obolibrary.org/obo/NCIT_C155831" + Bronchoalveolar_Lavage_Fluid = "http://purl.obolibrary.org/obo/NCIT_C13195" + Saliva = "http://purl.obolibrary.org/obo/NCIT_C13275" + Nasal_Swab = "http://purl.obolibrary.org/obo/NCIT_C132119" + Frozen_Food = "https://www.wikidata.org/wiki/Q751728" + SPECIMEN_TERMS = { # since Python 3.7 dict is ordered! 
Note that re is allowed + "Oronasopharynx": Oronasopharynx, + "orophar": Oropharyngeal, + "pharyngeal": Nasopharyngeal, + "\snares": Nasal_Swab, "saliva": Saliva, + "swab": Nasal_Swab, "seafood": Frozen_Food, - "packaging": Frozen_Food + "packaging": Frozen_Food, + "uknown": Unknown, + "unknown": Unknown } warning = None sample = types.SimpleNamespace(**sample) - try: if sample.specimen_source and \ not 'obolibrary' in sample.specimen_source and \ @@ -52,12 +61,13 @@ def specimen_source(sample,mapping): sample.specimen_source = mapping[key] else: for term in SPECIMEN_TERMS: - p = re.compile(term,re.IGNORECASE) + p = re.compile(".*?"+term,re.IGNORECASE) m = p.match(key) if m: sample.specimen_source = SPECIMEN_TERMS[term] if not sample.specimen_source: warning = f"No URI mapping for specimen_source <{key}>" - if sample.specimen_source == None: del(sample.specimen_source) + if sample.specimen_source == Unknown or sample.specimen_source == None: + del(sample.specimen_source) except AttributeError: pass return sample.__dict__,warning |