diff options
author | Pjotr Prins | 2021-01-03 09:48:14 +0000 |
---|---|---|
committer | Pjotr Prins | 2021-01-03 09:48:14 +0000 |
commit | a10c61d1f5afca70f54b762beefb402372d7fd31 (patch) | |
tree | 40824a1aad7c532979c9dada50163d745d0e3373 /workflows/pull-data/genbank/genbank.py | |
parent | a5ba1a8062e7116c2951762f86a6ae6d1638261d (diff) | |
download | bh20-seq-resource-a10c61d1f5afca70f54b762beefb402372d7fd31.tar.gz bh20-seq-resource-a10c61d1f5afca70f54b762beefb402372d7fd31.tar.lz bh20-seq-resource-a10c61d1f5afca70f54b762beefb402372d7fd31.zip |
genbank: technology parsing
Diffstat (limited to 'workflows/pull-data/genbank/genbank.py')
-rw-r--r-- | workflows/pull-data/genbank/genbank.py | 23 |
1 files changed, 22 insertions, 1 deletions
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 8f6ba06..314f50d 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -55,12 +55,15 @@ Example of an output JSON: Note: missing data should be None! Do not fill in other data by 'guessing'. +When data is malformed a warning should be issued. + """ def get_metadata(id, gbseq): host = types.SimpleNamespace() sample = types.SimpleNamespace() submitter = types.SimpleNamespace() + technology = types.SimpleNamespace() warnings = [] def warn(msg): @@ -95,6 +98,24 @@ def get_metadata(id, gbseq): submitter.additional_submitter_information = n pass + try: + n = gbseq.find("./GBSeq_comment").text + except AttributeError: + pass + if 'Assembly-Data' in n: + # print(n,file=sys.stderr) + # the following is wrong (de novo by default) + # technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628' + p = re.compile(r'.*Assembly Method :: ([^;]+).*') + m = p.match(n) + if m: technology.alignment_protocol = m.group(1) + p = re.compile(r'.*Coverage :: ([^;]+).*') + m = p.match(n) + if m: technology.sequencing_coverage = m.group(1) + p = re.compile(r'.*Sequencing Technology :: ([^;]+).*') + m = p.match(n) + if m: technology.sample_sequencing_technology = m.group(1).strip() + # --- Dates n = gbseq.find("./GBSeq_create-date") creation_date = dateparse(n.text).date() @@ -117,7 +138,7 @@ def get_metadata(id, gbseq): 'host': host, 'sample': sample, #'virus': virus, - #'technology': technology, + 'technology': technology, 'submitter': submitter, 'warnings': warnings, } |