From d55a1b6556e6cd6e09405cb1f4bcf58d52892331 Mon Sep 17 00:00:00 2001
From: Pjotr Prins
Date: Sun, 3 Jan 2021 07:40:34 +0000
Subject: genbank: get authors

---
 workflows/pull-data/genbank/genbank.py | 24 +++++++++++------------
 workflows/pull-data/genbank/ref.py     | 35 ----------------------------------
 2 files changed, 11 insertions(+), 48 deletions(-)
diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py
index 5ba1ef6..2d46f3d 100644
--- a/workflows/pull-data/genbank/genbank.py
+++ b/workflows/pull-data/genbank/genbank.py
@@ -51,6 +51,10 @@ Example of an output JSON:
     "submitter_address": "Pirogov Russian National Research Medical University, Ostrovityanova 1, Moscow 117997, Russia"
   }
 }
+
+Note: missing data must be recorded as None. Do not substitute guessed
+values (e.g. the creation date for a missing collection date).
+
 """
 
 def get_metadata(id, gbseq):
@@ -73,29 +77,23 @@ def get_metadata(id, gbseq):
     # </GBQualifier>
     sample.collection_location = "FIXME"
 
-    # --- Handling dates ---
-    # <GBSeq_create-date>29-JUL-2020</GBSeq_create-date>
+    submitter.authors = [n.text for n in gbseq.findall(".//GBAuthor")]
+
+    # --- Dates
     n = gbseq.find("./GBSeq_create-date")
     creation_date = dateparse(n.text).date()
-
-    # <GBSeq_update-date>30-JUL-2020</GBSeq_update-date>
     n = gbseq.find("./GBSeq_update-date")
     update_date = dateparse(n.text).date()
-
-    # <GBQualifier>
-    #   <GBQualifier_name>collection_date</GBQualifier_name>
-    #   <GBQualifier_value>2020-04-01</GBQualifier_value>
-    # </GBQualifier>
     n = gbseq.find(".//GBQualifier/GBQualifier_name/[.='collection_date']/../GBQualifier_value")
     try:
         date = dateparse(n.text).date()
         sample.collection_date = str(date)
     except dateutil.parser._parser.ParserError as e:
-        warn(str(e))
-        sample.collection_date = str(creation_date)
+        warn("No collection_date: ",str(e))
+        sample.collection_date = None
     except AttributeError:
-        warn("Missing collection_date - used creation_date instead")
-        sample.collection_date = str(creation_date)
+        warn("Missing collection_date")
+        sample.collection_date = None
 
     info = {
         'id': 'placeholder',
diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py
index f803112..e998d37 100644
--- a/workflows/pull-data/genbank/ref.py
+++ b/workflows/pull-data/genbank/ref.py
@@ -1,40 +1,5 @@
 # ---- BELOW IS JUST FOR REFERENCE ----
 
-
-min_len_to_count = 15000
-num_seq_with_len_ge_X_bp = 0
-
-missing_value_list = []
-not_created_accession_dict = {}
-accession_with_errors_list = []
-if None:
-
-    tree = ET.parse(path_metadata_xxx_xml)
-    GBSet = tree.getroot()
-
-    for GBSeq in GBSet:
-        accession_version = GBSeq.find('GBSeq_accession-version').text
-
-        try:
-            info = {
-                'id': 'placeholder',
-                'host': {},
-                'sample': {},
-                'virus': {},
-                'technology': {},
-                'submitter': {}
-            }
-
-            sample['sample_id'] = accession_version
-            sample['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now
-
-            # submitter info
-            GBSeq_references = GBSeq.find('GBSeq_references')
-            if GBSeq_references is not None:
-                author_list = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')]
-                if len(author_list) > 0:
-                    submitter['authors'] = author_list
-
                 GBReference = GBSeq_references.find('GBReference')
                 if GBReference is not None:
                     GBReference_journal = GBReference.find('GBReference_journal')
-- 
cgit v1.2.3