From a10c61d1f5afca70f54b762beefb402372d7fd31 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sun, 3 Jan 2021 09:48:14 +0000
Subject: genbank: technology parsing

---
 workflows/pull-data/genbank/genbank.py | 23 ++++++++++++++++++++++-
 workflows/pull-data/genbank/ref.py     | 18 ------------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 8f6ba06..314f50d 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -55,12 +55,15 @@ Example of an output JSON:
 Note: missing data should be None! Do not fill in other data by
 'guessing'.
 
+When data is malformed a warning should be issued.
+
 """
 
 def get_metadata(id, gbseq):
     host = types.SimpleNamespace()
     sample = types.SimpleNamespace()
     submitter = types.SimpleNamespace()
+    technology = types.SimpleNamespace()
     warnings = []
 
     def warn(msg):
@@ -95,6 +98,24 @@ def get_metadata(id, gbseq):
         submitter.additional_submitter_information = n
         pass
 
+    try:
+        n = gbseq.find("./GBSeq_comment").text
+    except AttributeError:
+        pass
+    if 'Assembly-Data' in n:
+        # print(n,file=sys.stderr)
+        # the following is wrong (de novo by default)
+        # technology.assembly_method = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
+        p = re.compile(r'.*Assembly Method :: ([^;]+).*')
+        m = p.match(n)
+        if m: technology.alignment_protocol = m.group(1)
+        p = re.compile(r'.*Coverage :: ([^;]+).*')
+        m = p.match(n)
+        if m: technology.sequencing_coverage = m.group(1)
+        p = re.compile(r'.*Sequencing Technology :: ([^;]+).*')
+        m = p.match(n)
+        if m: technology.sample_sequencing_technology = m.group(1).strip()
+
     # --- Dates
     n = gbseq.find("./GBSeq_create-date")
     creation_date = dateparse(n.text).date()
@@ -117,7 +138,7 @@ def get_metadata(id, gbseq):
         'host': host,
         'sample': sample,
         #'virus': virus,
-        #'technology': technology,
+        'technology': technology,
         'submitter': submitter,
         'warnings': warnings,
         }
diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py
index 66c9fb0..d809d7f 100644
--- a/workflows/pull-data/genbank/ref.py
+++ b/workflows/pull-data/genbank/ref.py
@@ -1,23 +1,5 @@
 # ---- BELOW IS JUST FOR REFERENCE ----
 
-            # This script download and prepare data and metadata for assemblies samples
-            technology['assembly_method'] = 'http://purl.obolibrary.org/obo/GENEPIO_0001628'
-
-            GBSeq_comment = GBSeq.find('GBSeq_comment')
-            if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text:
-                prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly'
-
-                GBSeq_comment_text = GBSeq_comment.text.split(
-                    '{}-Data-START## ; '.format(prefix_split_string)
-                )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0]
-
-                for info_to_check, field_in_yaml in zip(
-                    ['Assembly Method', 'Coverage', 'Sequencing Technology'],
-                    ['alignment_protocol', 'sequencing_coverage', 'sample_sequencing_technology']
-                ):
-                    if info_to_check in GBSeq_comment_text:
-                        tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0]
-
                         if field_in_yaml == 'sequencing_coverage':
                             # A regular expression would be better!
                             try:
-- 
cgit v1.2.3