From a10c61d1f5afca70f54b762beefb402372d7fd31 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 3 Jan 2021 09:48:14 +0000 Subject: genbank: technology parsing --- workflows/pull-data/genbank/genbank.py | 23 ++++++++++++++++++++++- workflows/pull-data/genbank/ref.py | 18 ------------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 8f6ba06..314f50d 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -55,12 +55,15 @@ Example of an output JSON: Note: missing data should be None! Do not fill in other data by 'guessing'. +When data is malformed a warning should be issued. + """ def get_metadata(id, gbseq): host = types.SimpleNamespace() sample = types.SimpleNamespace() submitter = types.SimpleNamespace() + technology = types.SimpleNamespace() warnings = [] def warn(msg): @@ -95,6 +98,24 @@ def get_metadata(id, gbseq): submitter.additional_submitter_information = n pass + try: + n = gbseq.find("./GBSeq_comment").text + except AttributeError: + pass + if 'Assembly-Data' in n: + # print(n,file=sys.stderr) + # the following is wrong (de novo by default) + # technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628' + p = re.compile(r'.*Assembly Method :: ([^;]+).*') + m = p.match(n) + if m: technology.alignment_protocol = m.group(1) + p = re.compile(r'.*Coverage :: ([^;]+).*') + m = p.match(n) + if m: technology.sequencing_coverage = m.group(1) + p = re.compile(r'.*Sequencing Technology :: ([^;]+).*') + m = p.match(n) + if m: technology.sample_sequencing_technology = m.group(1).strip() + # --- Dates n = gbseq.find("./GBSeq_create-date") creation_date = dateparse(n.text).date() @@ -117,7 +138,7 @@ def get_metadata(id, gbseq): 'host': host, 'sample': sample, #'virus': virus, - #'technology': technology, + 'technology': technology, 'submitter': submitter, 'warnings': warnings, } diff --git 
a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py index 66c9fb0..d809d7f 100644 --- a/workflows/pull-data/genbank/ref.py +++ b/workflows/pull-data/genbank/ref.py @@ -1,23 +1,5 @@ # ---- BELOW IS JUST FOR REFERENCE ---- - # This script download and prepare data and metadata for assemblies samples - technology['assembly_method'] = 'http://purl.obolibrary.org/obo/GENEPIO_0001628' - - GBSeq_comment = GBSeq.find('GBSeq_comment') - if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text: - prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly' - - GBSeq_comment_text = GBSeq_comment.text.split( - '{}-Data-START## ; '.format(prefix_split_string) - )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0] - - for info_to_check, field_in_yaml in zip( - ['Assembly Method', 'Coverage', 'Sequencing Technology'], - ['alignment_protocol', 'sequencing_coverage', 'sample_sequencing_technology'] - ): - if info_to_check in GBSeq_comment_text: - tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0] - if field_in_yaml == 'sequencing_coverage': # A regular expression would be better! try: -- cgit v1.2.3