From d55a1b6556e6cd6e09405cb1f4bcf58d52892331 Mon Sep 17 00:00:00 2001 From: Pjotr Prins Date: Sun, 3 Jan 2021 07:40:34 +0000 Subject: genbank: get authors --- workflows/pull-data/genbank/genbank.py | 24 +++++++++++------------ workflows/pull-data/genbank/ref.py | 35 ---------------------------------- 2 files changed, 11 insertions(+), 48 deletions(-) diff --git a/workflows/pull-data/genbank/genbank.py b/workflows/pull-data/genbank/genbank.py index 5ba1ef6..2d46f3d 100644 --- a/workflows/pull-data/genbank/genbank.py +++ b/workflows/pull-data/genbank/genbank.py @@ -51,6 +51,10 @@ Example of an output JSON: "submitter_address": "Pirogov Russian National Research Medical University, Ostrovityanova 1, Moscow 117997, Russia" } } + +Note: missing data should be None! Do not fill in other data by +'guessing'. + """ def get_metadata(id, gbseq): @@ -73,29 +77,23 @@ def get_metadata(id, gbseq): # sample.collection_location = "FIXME" - # --- Handling dates --- - # 29-JUL-2020 + submitter.authors = [n.text for n in gbseq.findall(".//GBAuthor")] + + # --- Dates n = gbseq.find("./GBSeq_create-date") creation_date = dateparse(n.text).date() - - # 30-JUL-2020 n = gbseq.find("./GBSeq_update-date") update_date = dateparse(n.text).date() - - # - # collection_date - # 2020-04-01 - # n = gbseq.find(".//GBQualifier/GBQualifier_name/[.='collection_date']/../GBQualifier_value") try: date = dateparse(n.text).date() sample.collection_date = str(date) except dateutil.parser._parser.ParserError as e: - warn(str(e)) - sample.collection_date = str(creation_date) + warn("No collection_date: ",str(e)) + sample.collection_date = None except AttributeError: - warn("Missing collection_date - used creation_date instead") - sample.collection_date = str(creation_date) + warn("Missing collection_date") + sample.collection_date = None info = { 'id': 'placeholder', diff --git a/workflows/pull-data/genbank/ref.py b/workflows/pull-data/genbank/ref.py index f803112..e998d37 100644 --- a/workflows/pull-data/genbank/ref.py +++ b/workflows/pull-data/genbank/ref.py @@ -1,40 +1,5 @@ # ---- BELOW IS JUST FOR REFERENCE ---- - -min_len_to_count = 15000 -num_seq_with_len_ge_X_bp = 0 - -missing_value_list = [] -not_created_accession_dict = {} -accession_with_errors_list = [] -if None: - - tree = ET.parse(path_metadata_xxx_xml) - GBSet = tree.getroot() - - for GBSeq in GBSet: - accession_version = GBSeq.find('GBSeq_accession-version').text - - try: - info = { - 'id': 'placeholder', - 'host': {}, - 'sample': {}, - 'virus': {}, - 'technology': {}, - 'submitter': {} - } - - sample['sample_id'] = accession_version - sample['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now - - # submitter info - GBSeq_references = GBSeq.find('GBSeq_references') - if GBSeq_references is not None: - author_list = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')] - if len(author_list) > 0: - submitter['authors'] = author_list - GBReference = GBSeq_references.find('GBReference') if GBReference is not None: GBReference_journal = GBReference.find('GBReference_journal') -- cgit v1.2.3