From 21ac66de2f92b7a433c6677d7526ee8d9639d999 Mon Sep 17 00:00:00 2001
From: Andrea Guarracino
Date: Mon, 22 Jun 2020 22:41:04 +0200
Subject: little fix for specimen_source

---
 scripts/from_genbank_to_fasta_and_yaml.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
index 87e99d4..6216340 100755
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ b/scripts/from_genbank_to_fasta_and_yaml.py
@@ -305,12 +305,14 @@ for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml)
                         if GBQualifier_value_text in term_to_uri_dict:
                             info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
                         else:
-                            if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab']:
+                            if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
                                 info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
-                            elif GBQualifier_value_text in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab']:
+                            elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
                                 info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
-                            elif GBQualifier_value_text in ['nasopharyngeal aspirate/throat swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal aspirate'], term_to_uri_dict['throat swab']]
+                            elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
+                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
+                            elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
+                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
                             else:
                                 missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
                     elif GBQualifier_name_text == 'collection_date':
-- 
cgit 1.4.1


From 5bd991fe5f048b2c4b2c23c6149c1bd789d3bdbc Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Mon, 22 Jun 2020 23:23:03 +0200
Subject: added new dictionary entries

---
 scripts/dict_ontology_standardization/ncbi_countries.csv           | 4 ++++
 scripts/dict_ontology_standardization/ncbi_host_species.csv        | 1 +
 .../dict_ontology_standardization/ncbi_sequencing_technology.csv   | 7 +++++++
 3 files changed, 12 insertions(+)

diff --git a/scripts/dict_ontology_standardization/ncbi_countries.csv b/scripts/dict_ontology_standardization/ncbi_countries.csv
index 85d4e8a..204f7f2 100644
--- a/scripts/dict_ontology_standardization/ncbi_countries.csv
+++ b/scripts/dict_ontology_standardization/ncbi_countries.csv
@@ -239,6 +239,7 @@ Solomon Islands,http://www.wikidata.org/entity/Q685
 Somalia,http://www.wikidata.org/entity/Q1045
 South Africa,http://www.wikidata.org/entity/Q258
 South Africa: KwaZulu-Natal,http://www.wikidata.org/entity/Q81725
+South Africa: KZN,http://www.wikidata.org/entity/Q81725
 South African Republic,http://www.wikidata.org/entity/Q550374
 South Korea,http://www.wikidata.org/entity/Q884
 South Sudan,http://www.wikidata.org/entity/Q958
@@ -279,6 +280,7 @@ USA: AR,http://www.wikidata.org/entity/Q1612
 USA: AZ,http://www.wikidata.org/entity/Q816
 USA: Arizona,http://www.wikidata.org/entity/Q816
 USA: CA,http://www.wikidata.org/entity/Q99
+USA:CA,http://www.wikidata.org/entity/Q99
 USA: California,http://www.wikidata.org/entity/Q99
 USA:California,http://www.wikidata.org/entity/Q99
 "USA: CA, San Diego County",http://www.wikidata.org/entity/Q108143
@@ -299,6 +301,7 @@ USA: IN,http://www.wikidata.org/entity/Q1415
 USA: KS,http://www.wikidata.org/entity/Q1558
 USA: KY,http://www.wikidata.org/entity/Q1603
 USA: LA,http://www.wikidata.org/entity/Q1588
+USA:Los Angeles,http://www.wikidata.org/entity/Q65
 "USA: New Orleans, LA",http://www.wikidata.org/entity/Q34404
 USA: MA,http://www.wikidata.org/entity/Q771
 USA: Massachusetts,http://www.wikidata.org/entity/Q771
@@ -330,6 +333,7 @@ USA: PA,http://www.wikidata.org/entity/Q1400
 USA: RI,http://www.wikidata.org/entity/Q1387
 "USA: San Francisco, CA",http://www.wikidata.org/entity/Q62
 USA: SC,http://www.wikidata.org/entity/Q1456
+USA: South Carolina,http://www.wikidata.org/entity/Q1456
 USA: SD,http://www.wikidata.org/entity/Q1211
 "USA: Snohomish County, WA",http://www.wikidata.org/entity/Q110403
 USA: TN,http://www.wikidata.org/entity/Q1509
diff --git a/scripts/dict_ontology_standardization/ncbi_host_species.csv b/scripts/dict_ontology_standardization/ncbi_host_species.csv
index 102d458..bc6ac04 100644
--- a/scripts/dict_ontology_standardization/ncbi_host_species.csv
+++ b/scripts/dict_ontology_standardization/ncbi_host_species.csv
@@ -1,6 +1,7 @@
 Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
 human,http://purl.obolibrary.org/obo/NCBITaxon_9606
 Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
+sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
 Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
 Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
 Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
diff --git a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
index 110e90b..964cbf3 100644
--- a/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
+++ b/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv
@@ -1,3 +1,9 @@
+Illumina HiSeq 1000,http://www.ebi.ac.uk/efo/EFO_0004204
+Illumina HiSeq 2000,http://www.ebi.ac.uk/efo/EFO_0004203
+Illumina HiSeq 2500,http://www.ebi.ac.uk/efo/EFO_0008565
+Illumina HiSeq 3000,http://www.ebi.ac.uk/efo/EFO_0008564
+Illumina HiSeq 4000,http://www.ebi.ac.uk/efo/EFO_0008563
+Illumina iSeq 100,http://www.ebi.ac.uk/efo/EFO_0008635
 Illumian NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
 Illumina NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
 NextSeq500,http://www.ebi.ac.uk/efo/EFO_0009173
@@ -14,6 +20,7 @@ ONT (Oxford Nanopore Technologies),http://purl.obolibrary.org/obo/NCIT_C146818
 Oxford Nanopore Technology,http://purl.obolibrary.org/obo/NCIT_C146818
 Oxford Nanopore technologies MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 MinION Oxford Nanopore,http://www.ebi.ac.uk/efo/EFO_0008632
+MinION,http://www.ebi.ac.uk/efo/EFO_0008632
 Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818
 Illumina MiSeq,http://www.ebi.ac.uk/efo/EFO_0004205
 Illumina,http://purl.obolibrary.org/obo/OBI_0000759
-- 
cgit 1.4.1


From 75ad6744aa8edb7de92a427c55405d9331879ad4 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Mon, 22 Jun 2020 23:27:19 +0200
Subject: moved the genbank script in his specific directory

---
 .../from_genbank_to_fasta_and_yaml.py              |  395 +++++
 scripts/from_genbank_to_fasta_and_yaml.py          |  395 -----
 scripts/sequences.acc                              | 1731 --------------------
 3 files changed, 395 insertions(+), 2126 deletions(-)
 create mode 100755 scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
 delete mode 100755 scripts/from_genbank_to_fasta_and_yaml.py
 delete mode 100644 scripts/sequences.acc

diff --git a/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
new file mode 100755
index 0000000..d76f56b
--- /dev/null
+++ b/scripts/download_genbank_data/from_genbank_to_fasta_and_yaml.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
+parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False)
+args = parser.parse_args()
+
+from Bio import Entrez
+Entrez.email = 'another_email@gmail.com'
+
+import xml.etree.ElementTree as ET
+import json
+import os
+import requests
+import sys
+
+from datetime import date
+from dateutil.parser import parse
+
+num_ids_for_request = 100
+
+dir_metadata = 'metadata_from_nuccore'
+dir_fasta_and_yaml = 'fasta_and_yaml'
+dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+
+today_date = date.today().strftime("%Y.%m.%d")
+path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date)
+
+def is_integer(string_to_check):
+    try:
+        int(string_to_check)
+        return True
+    except ValueError:
+        return False
+
+def chunks(lst, n):
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+if os.path.exists(dir_metadata):
+    print("The directory '{}' already exists.".format(dir_metadata))
+
+    if not args.skip_request:
+        print("\tTo start the request, delete the directory '{}' or specify --skip-request.".format(dir_metadata))
+        sys.exit(-1)
+
+
+accession_already_downloaded_set = []
+
+if os.path.exists(dir_fasta_and_yaml):
+    print("The directory '{}' already exists.".format(dir_fasta_and_yaml))
+    if not args.only_missing_id:
+        print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml))
+        sys.exit(-1)
+
+    accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')])
+    print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set)))
+
+
+if not os.path.exists(dir_metadata):
+    # Take all the ids
+    id_set = set()
+
+    # Try to search several strings
+    term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
+    for term in term_list:
+        tmp_list = Entrez.read(
+            Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
+        )['IdList']
+
+        # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq)
+        tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']]
+
+        # Remove the version in the id
+        tmp_list = [x.split('.')[0] for x in tmp_list]
+
+        #tmp_list = tmp_list[0:2] # restricting to small run
+        new_ids_set = set([x.split('.')[0] for x in tmp_list])
+        new_ids = len(new_ids_set.difference(id_set))
+        id_set.update(new_ids_set)
+
+        print('Term:', term, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
+
+    if not os.path.exists(path_ncbi_virus_accession):
+        r = requests.get('https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?q=*:*&fq=%7B!tag=SeqType_s%7DSeqType_s:(%22Nucleotide%22)&fq=VirusLineageId_ss:(2697049)&cmd=download&sort=SourceDB_s%20desc,CreateDate_dt%20desc,id%20asc&dlfmt=acc&fl=id')
+        with open(path_ncbi_virus_accession, 'w') as fw:
+            fw.write(r.text)
+
+    with open(path_ncbi_virus_accession) as f:
+        tmp_list = [line.strip('\n') for line in f]
+
+    new_ids = len(set(tmp_list).difference(id_set))
+    id_set.update(tmp_list)
+
+    print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
+
+    if len(accession_already_downloaded_set) > 0:
+        id_set = id_set.difference(accession_already_downloaded_set)
+        print('There are {} missing IDs to download.'.format(len(id_set)))
+
+    os.makedirs(dir_metadata)
+    for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
+        path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i))
+        print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml))
+
+        with open(path_metadata_xxx_xml, 'w') as fw:
+            fw.write(
+                Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read()
+            )
+
+
+term_to_uri_dict = {}
+
+for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
+    print('Read {}'.format(path_dict_xxx_csv))
+
+    with open(path_dict_xxx_csv) as f:
+        for line in f:
+            if len(line.split(',')) > 2:
+                term, uri = line.strip('\n').split('",')
+                term = term.strip('"')
+            else:
+                term, uri = line.strip('\n').split(',')
+
+            if term in term_to_uri_dict:
+                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                continue
+                
+            term_to_uri_dict[term] = uri
+
+if not os.path.exists(dir_fasta_and_yaml):
+    os.makedirs(dir_fasta_and_yaml)
+
+min_len_to_count = 27500
+num_seq_with_len_ge_X_bp = 0
+
+missing_value_list = []
+accession_with_errors_list = []
+
+for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
+    tree = ET.parse(path_metadata_xxx_xml)
+    GBSet = tree.getroot()
+
+    for GBSeq in GBSet:
+        accession_version = GBSeq.find('GBSeq_accession-version').text
+
+        GBSeq_sequence = GBSeq.find('GBSeq_sequence')
+        if GBSeq_sequence is None:
+            print(accession_version, ' - sequence not found')
+            continue
+
+        try:
+            #print(path_metadata_xxx_xml, accession_version)
+
+            # A general default-empty yaml could be read from the definitive one
+            info_for_yaml_dict = {
+                'id': 'placeholder',
+                'host': {},
+                'sample': {},
+                'virus': {},
+                'technology': {},
+                'submitter': {}
+            }
+
+
+            info_for_yaml_dict['sample']['sample_id'] = accession_version
+            info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now
+
+
+            # submitter info
+            GBSeq_references = GBSeq.find('GBSeq_references')
+            if GBSeq_references is not None:
+                info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')]
+
+                GBReference = GBSeq_references.find('GBReference')
+                if GBReference is not None:
+                    GBReference_journal = GBReference.find('GBReference_journal')
+
+                    if GBReference_journal is not None and GBReference_journal.text != 'Unpublished':
+                        if 'Submitted' in GBReference_journal.text:
+                            info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())]
+                            info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip()
+                        else:
+                            info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text
+
+
+            GBSeq_comment = GBSeq.find('GBSeq_comment')
+            if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text:
+                prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly'
+
+                GBSeq_comment_text = GBSeq_comment.text.split(
+                    '{}-Data-START## ; '.format(prefix_split_string)
+                )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0]
+
+                for info_to_check, field_in_yaml in zip(
+                    ['Assembly Method', 'Coverage', 'Sequencing Technology'],
+                    ['sequence_assembly_method', 'sequencing_coverage', 'sample_sequencing_technology']
+                ):
+                    if info_to_check in GBSeq_comment_text:
+                        tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0]
+
+                        if field_in_yaml == 'sequencing_coverage':
+                            # A regular expression would be better!
+                            try:
+                                info_for_yaml_dict['technology'][field_in_yaml] = [
+                                    float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
+                                ]
+                            except ValueError:
+                                print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
+                                pass
+                        elif field_in_yaml == 'sample_sequencing_technology':
+                            new_seq_tec_list = []
+                            for seq_tec in tech_info_to_parse.split(';'):
+                                seq_tec = seq_tec.strip()
+                                if seq_tec in term_to_uri_dict:
+                                    seq_tec = term_to_uri_dict[seq_tec]
+                                else:
+                                    missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec]))
+
+                                new_seq_tec_list.append(seq_tec)
+
+                            info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list]
+                        else:
+                            info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
+
+
+            for GBFeature in GBSeq.iter('GBFeature'):
+                if GBFeature.find('GBFeature_key').text != 'source':
+                    continue
+
+                for GBQualifier in GBFeature.iter('GBQualifier'):
+                    GBQualifier_value = GBQualifier.find('GBQualifier_value')
+                    if GBQualifier_value is None:
+                        continue
+                    GBQualifier_value_text = GBQualifier_value.text
+
+                    GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text
+
+                    if GBQualifier_name_text == 'host':
+                        GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
+
+                        if GBQualifier_value_text_list[0] in term_to_uri_dict:
+                            info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text_list[0]]
+                        elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version):
+                            # Information checked manually from NCBI Virus
+                            info_for_yaml_dict['host']['host_species'] = term_to_uri_dict['Canis lupus familiaris']
+                        else:
+                            missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
+
+                        # Possible cases:
+                        # - Homo sapiens						--> ['Homo sapiens']
+                        # - Homo sapiens; female				--> ['Homo sapiens', 'female']
+                        # - Homo sapiens; female 63				--> ['Homo sapiens', 'female 63']
+                        # - Homo sapiens; female; age 40		--> ['Homo sapiens', 'female', 'age 40']
+                        # - Homo sapiens; gender: F; age: 61	--> ['Homo sapiens', 'gender: F', 'age: 61']
+                        # - Homo sapiens; gender: M; age: 68	--> ['Homo sapiens', 'gender: M', 'age: 68']
+                        # - Homo sapiens; hospitalized patient	--> ['Homo sapiens', 'hospitalized patient']
+                        # - Homo sapiens; male					--> ['Homo sapiens', 'male']
+                        # - Homo sapiens; male; 63				--> ['Homo sapiens', 'male', '63']
+                        # - Homo sapiens; male; age 29			--> ['Homo sapiens', 'male', 'age 29']
+                        # - Homo sapiens; symptomatic			--> ['Homo sapiens', 'symptomatic']
+                        if len(GBQualifier_value_text_list) > 1:
+                            host_sex = ''
+                            if 'female' in GBQualifier_value_text_list[1]:
+                                host_sex = 'female'
+                            elif 'male' in GBQualifier_value_text_list[1]:
+                                host_sex = 'male'
+                            elif 'gender' in GBQualifier_value_text_list[1]:
+                                host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip()
+                                if host_sex_one_lecter in ['F', 'M']:
+                                    host_sex = 'female' if host_sex_one_lecter == 'F' else 'male'
+
+                            if host_sex in ['male', 'female']:
+                                info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
+                            elif GBQualifier_value_text_list[1] in term_to_uri_dict:
+                                info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]                            
+                            else:
+                                missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]]))
+
+                            # Host age
+                            host_age = -1
+                            if len(GBQualifier_value_text_list[1].split(' ')) > 1 and is_integer(GBQualifier_value_text_list[1].split(' ')[-1]):
+                                host_age = int(GBQualifier_value_text_list[1].split(' ')[-1])
+                            elif len(GBQualifier_value_text_list) > 2 and is_integer(GBQualifier_value_text_list[2].split(' ')[-1]):
+                                host_age = int(GBQualifier_value_text_list[2].split(' ')[-1])
+
+                            if host_age > -1:
+                                info_for_yaml_dict['host']['host_age'] = host_age
+                                info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
+                            elif len(GBQualifier_value_text_list) > 2:
+                                missing_value_list.append('\t'.join([accession_version, 'host_age', GBQualifier_value_text_list[2]]))
+                    elif GBQualifier_name_text == 'collected_by':
+                        if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]):
+                            info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text
+                        else:
+                            info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text
+                    elif GBQualifier_name_text == 'isolation_source':
+                        if GBQualifier_value_text.upper() in term_to_uri_dict:
+                            GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
+
+                        # Little cleaning
+                        GBQualifier_value_text = GBQualifier_value_text.strip("/'")
+
+                        if GBQualifier_value_text in term_to_uri_dict:
+                            info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
+                        else:
+                            if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
+                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
+                            elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
+                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+                            elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
+                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
+                            elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
+                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
+                            else:
+                                missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
+                    elif GBQualifier_name_text == 'collection_date':
+                        # TO_DO: which format we will use?
+                        date_to_write = GBQualifier_value_text
+
+                        if len(GBQualifier_value_text.split('-')) == 1:
+                            if int(GBQualifier_value_text) < 2020:
+                                date_to_write = "{}-12-15".format(GBQualifier_value_text)
+                            else:
+                                date_to_write = "{}-01-15".format(GBQualifier_value_text)
+
+                            if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                                info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                            else:
+                                info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                        elif len(GBQualifier_value_text.split('-')) == 2:
+                            date_to_write += '-15'
+
+                            if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                                info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                            else:
+                                info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
+                        elif len(GBQualifier_value_text.split('-')) == 3:
+                            GBQualifier_value_text_list = GBQualifier_value_text.split('-')
+
+                            if GBQualifier_value_text_list[1].isalpha():
+                                date_to_write = parse(GBQualifier_value_text).strftime('%Y-%m-%d')
+
+                        info_for_yaml_dict['sample']['collection_date'] = date_to_write
+                    elif GBQualifier_name_text in ['lat_lon', 'country']:
+                        if GBQualifier_value_text == 'Hong Kong':
+                            GBQualifier_value_text = 'China: Hong Kong'
+
+                        if GBQualifier_value_text in term_to_uri_dict:
+                            info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text]
+                        else:
+                            missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text]))
+                    elif GBQualifier_name_text == 'note':
+                        if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                            info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text
+                        else:
+                            info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
+                    elif GBQualifier_name_text == 'isolate':
+                        info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
+                    elif GBQualifier_name_text == 'db_xref':
+                        info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
+
+
+            # Remove technology key if empty!
+            if (info_for_yaml_dict['technology']=={}):
+                del info_for_yaml_dict['technology']
+
+            with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
+                fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
+
+            with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
+                json.dump(info_for_yaml_dict, fw, indent=2)
+
+
+            if(len(GBSeq_sequence.text) >= min_len_to_count):
+                num_seq_with_len_ge_X_bp += 1
+        except:
+            print("Unexpected error for the ID {}: {}".format(accession_version, sys.exc_info()[0]))
+            accession_with_errors_list.append(accession_version)
+            continue
+
+if len(missing_value_list) > 0:
+    path_missing_terms_tsv = 'missing_terms.tsv'
+    print('Written missing terms in {}'.format(path_missing_terms_tsv))
+    with open(path_missing_terms_tsv, 'w') as fw:
+        fw.write('\n'.join(missing_value_list))
+
+if len(accession_with_errors_list) > 0:
+    path_accession_with_errors_tsv = 'accession_with_errors.tsv'
+    print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
+    with open(path_accession_with_errors_tsv, 'w') as fw:
+        fw.write('\n'.join(accession_with_errors_list))
+        
+print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
diff --git a/scripts/from_genbank_to_fasta_and_yaml.py b/scripts/from_genbank_to_fasta_and_yaml.py
deleted file mode 100755
index 6216340..0000000
--- a/scripts/from_genbank_to_fasta_and_yaml.py
+++ /dev/null
@@ -1,395 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-parser = argparse.ArgumentParser()
-parser.add_argument('--skip-request', action='store_true', help='skip metadata and sequence request', required=False)
-parser.add_argument('--only-missing-id', action='store_true', help='download only missing id', required=False)
-args = parser.parse_args()
-
-from Bio import Entrez
-Entrez.email = 'another_email@gmail.com'
-
-import xml.etree.ElementTree as ET
-import json
-import os
-import requests
-import sys
-
-from datetime import date
-from dateutil.parser import parse
-
-num_ids_for_request = 100
-
-dir_metadata = 'metadata_from_nuccore'
-dir_fasta_and_yaml = 'fasta_and_yaml'
-dir_dict_ontology_standardization = 'dict_ontology_standardization/'
-
-today_date = date.today().strftime("%Y.%m.%d")
-path_ncbi_virus_accession = 'sequences.{}.acc'.format(today_date)
-
-def is_integer(string_to_check):
-    try:
-        int(string_to_check)
-        return True
-    except ValueError:
-        return False
-
-def chunks(lst, n):
-    for i in range(0, len(lst), n):
-        yield lst[i:i + n]
-
-if os.path.exists(dir_metadata):
-    print("The directory '{}' already exists.".format(dir_metadata))
-
-    if not args.skip_request:
-        print("\tTo start the request, delete the directory '{}' or specify --skip-request.".format(dir_metadata))
-        sys.exit(-1)
-
-
-accession_already_downloaded_set = []
-
-if os.path.exists(dir_fasta_and_yaml):
-    print("The directory '{}' already exists.".format(dir_fasta_and_yaml))
-    if not args.only_missing_id:
-        print("To start the download, delete the directory '{}' or specify --only-missing-id.".format(dir_fasta_and_yaml))
-        sys.exit(-1)
-
-    accession_already_downloaded_set = set([x.split('.yaml')[0].split('.')[0] for x in os.listdir(dir_fasta_and_yaml) if x.endswith('.yaml')])
-    print('There are {} accession already downloaded.'.format(len(accession_already_downloaded_set)))
-
-
-if not os.path.exists(dir_metadata):
-    # Take all the ids
-    id_set = set()
-
-    # Try to search several strings
-    term_list = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
-    for term in term_list:
-        tmp_list = Entrez.read(
-            Entrez.esearch(db='nuccore', term=term, idtype='acc', retmax='10000')
-        )['IdList']
-
-        # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq)
-        tmp_list = [x for x in tmp_list if x[:2] not in ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']]
-
-        # Remove the version in the id
-        tmp_list = [x.split('.')[0] for x in tmp_list]
-
-        #tmp_list = tmp_list[0:2] # restricting to small run
-        new_ids_set = set([x.split('.')[0] for x in tmp_list])
-        new_ids = len(new_ids_set.difference(id_set))
-        id_set.update(new_ids_set)
-
-        print('Term:', term, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
-
-    if not os.path.exists(path_ncbi_virus_accession):
-        r = requests.get('https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/?q=*:*&fq=%7B!tag=SeqType_s%7DSeqType_s:(%22Nucleotide%22)&fq=VirusLineageId_ss:(2697049)&cmd=download&sort=SourceDB_s%20desc,CreateDate_dt%20desc,id%20asc&dlfmt=acc&fl=id')
-        with open(path_ncbi_virus_accession, 'w') as fw:
-            fw.write(r.text)
-
-    with open(path_ncbi_virus_accession) as f:
-        tmp_list = [line.strip('\n') for line in f]
-
-    new_ids = len(set(tmp_list).difference(id_set))
-    id_set.update(tmp_list)
-
-    print('DB: NCBI Virus', today_date, '-->', new_ids, 'new IDs from', len(tmp_list), '---> Total unique IDs:', len(id_set))
-
-    if len(accession_already_downloaded_set) > 0:
-        id_set = id_set.difference(accession_already_downloaded_set)
-        print('There are {} missing IDs to download.'.format(len(id_set)))
-
-    os.makedirs(dir_metadata)
-    for i, id_x_list in enumerate(chunks(list(id_set), num_ids_for_request)):
-        path_metadata_xxx_xml = os.path.join(dir_metadata, 'metadata_{}.xml'.format(i))
-        print('Requesting {} ids --> {}'.format(len(id_x_list), path_metadata_xxx_xml))
-
-        with open(path_metadata_xxx_xml, 'w') as fw:
-            fw.write(
-                Entrez.efetch(db='nuccore', id=id_x_list, retmode='xml').read()
-            )
-
-
-term_to_uri_dict = {}
-
-for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
-    print('Read {}'.format(path_dict_xxx_csv))
-
-    with open(path_dict_xxx_csv) as f:
-        for line in f:
-            if len(line.split(',')) > 2:
-                term, uri = line.strip('\n').split('",')
-                term = term.strip('"')
-            else:
-                term, uri = line.strip('\n').split(',')
-
-            if term in term_to_uri_dict:
-                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
-                continue
-                
-            term_to_uri_dict[term] = uri
-
-if not os.path.exists(dir_fasta_and_yaml):
-    os.makedirs(dir_fasta_and_yaml)
-
-min_len_to_count = 27500
-num_seq_with_len_ge_X_bp = 0
-
-missing_value_list = []
-accession_with_errors_list = []
-
-for path_metadata_xxx_xml in [os.path.join(dir_metadata, name_metadata_xxx_xml) for name_metadata_xxx_xml in os.listdir(dir_metadata) if name_metadata_xxx_xml.endswith('.xml')]:
-    tree = ET.parse(path_metadata_xxx_xml)
-    GBSet = tree.getroot()
-
-    for GBSeq in GBSet:
-        accession_version = GBSeq.find('GBSeq_accession-version').text
-
-        GBSeq_sequence = GBSeq.find('GBSeq_sequence')
-        if GBSeq_sequence is None:
-            print(accession_version, ' - sequence not found')
-            continue
-
-        try:
-            #print(path_metadata_xxx_xml, accession_version)
-
-            # A general default-empty yaml could be read from the definitive one
-            info_for_yaml_dict = {
-                'id': 'placeholder',
-                'host': {},
-                'sample': {},
-                'virus': {},
-                'technology': {},
-                'submitter': {}
-            }
-
-
-            info_for_yaml_dict['sample']['sample_id'] = accession_version
-            info_for_yaml_dict['sample']['source_database_accession'] = ["http://identifiers.org/insdc/"+accession_version+"#sequence"] #accession is turned into resolvable URL/URI now
-
-
-            # submitter info
-            GBSeq_references = GBSeq.find('GBSeq_references')
-            if GBSeq_references is not None:
-                info_for_yaml_dict['submitter']['authors'] = ["{}".format(x.text) for x in GBSeq_references.iter('GBAuthor')]
-
-                GBReference = GBSeq_references.find('GBReference')
-                if GBReference is not None:
-                    GBReference_journal = GBReference.find('GBReference_journal')
-
-                    if GBReference_journal is not None and GBReference_journal.text != 'Unpublished':
-                        if 'Submitted' in GBReference_journal.text:
-                            info_for_yaml_dict['submitter']['submitter_name'] = ["{}".format(GBReference_journal.text.split(') ')[1].split(',')[0].strip())]
-                            info_for_yaml_dict['submitter']['submitter_address'] = ','.join(GBReference_journal.text.split(') ')[1].split(',')[1:]).strip()
-                        else:
-                            info_for_yaml_dict['submitter']['additional_submitter_information'] = GBReference_journal.text
-
-
-            GBSeq_comment = GBSeq.find('GBSeq_comment')
-            if GBSeq_comment is not None and 'Assembly-Data' in GBSeq_comment.text:
-                prefix_split_string = '##Genome-Assembly' if GBSeq_comment.text.startswith('##Genome-') else '##Assembly'
-
-                GBSeq_comment_text = GBSeq_comment.text.split(
-                    '{}-Data-START## ; '.format(prefix_split_string)
-                )[1].split(' ; {}-Data-END##'.format(prefix_split_string))[0]
-
-                for info_to_check, field_in_yaml in zip(
-                    ['Assembly Method', 'Coverage', 'Sequencing Technology'],
-                    ['sequence_assembly_method', 'sequencing_coverage', 'sample_sequencing_technology']
-                ):
-                    if info_to_check in GBSeq_comment_text:
-                        tech_info_to_parse = GBSeq_comment_text.split('{} :: '.format(info_to_check))[1].split(' ;')[0]
-
-                        if field_in_yaml == 'sequencing_coverage':
-                            # A regular expression would be better!
-                            try:
-                                info_for_yaml_dict['technology'][field_in_yaml] = [
-                                    float(tech_info_to_parse.strip('(average)').strip("reads/nt").strip('(average for 6 sequences)').replace(',', '.').strip(' xX>'))
-                                ]
-                            except ValueError:
-                                print(accession_version, "Couldn't make sense of Coverage '%s'" % tech_info_to_parse)
-                                pass
-                        elif field_in_yaml == 'sample_sequencing_technology':
-                            new_seq_tec_list = []
-                            for seq_tec in tech_info_to_parse.split(';'):
-                                seq_tec = seq_tec.strip()
-                                if seq_tec in term_to_uri_dict:
-                                    seq_tec = term_to_uri_dict[seq_tec]
-                                else:
-                                    missing_value_list.append('\t'.join([accession_version, 'sample_sequencing_technology', seq_tec]))
-
-                                new_seq_tec_list.append(seq_tec)
-
-                            info_for_yaml_dict['technology']['sample_sequencing_technology'] = [x for x in new_seq_tec_list]
-                        else:
-                            info_for_yaml_dict['technology'][field_in_yaml] = tech_info_to_parse
-
-
-            for GBFeature in GBSeq.iter('GBFeature'):
-                if GBFeature.find('GBFeature_key').text != 'source':
-                    continue
-
-                for GBQualifier in GBFeature.iter('GBQualifier'):
-                    GBQualifier_value = GBQualifier.find('GBQualifier_value')
-                    if GBQualifier_value is None:
-                        continue
-                    GBQualifier_value_text = GBQualifier_value.text
-
-                    GBQualifier_name_text = GBQualifier.find('GBQualifier_name').text
-
-                    if GBQualifier_name_text == 'host':
-                        GBQualifier_value_text_list = GBQualifier_value_text.split('; ')
-
-                        if GBQualifier_value_text_list[0] in term_to_uri_dict:
-                            info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[GBQualifier_value_text_list[0]]
-                        elif GBQualifier_value_text_list[0] and ('MT215193' in accession_version or 'MT270814' in accession_version):
-                            # Information checked manually from NCBI Virus
-                            info_for_yaml_dict['host']['host_species'] = term_to_uri_dict['Canis lupus familiaris']
-                        else:
-                            missing_value_list.append('\t'.join([accession_version, 'host_species', GBQualifier_value_text_list[0]]))
-
-                        # Possible cases:
-                        # - Homo sapiens						--> ['Homo sapiens']
-                        # - Homo sapiens; female				--> ['Homo sapiens', 'female']
-                        # - Homo sapiens; female 63				--> ['Homo sapiens', 'female 63']
-                        # - Homo sapiens; female; age 40		--> ['Homo sapiens', 'female', 'age 40']
-                        # - Homo sapiens; gender: F; age: 61	--> ['Homo sapiens', 'gender: F', 'age: 61']
-                        # - Homo sapiens; gender: M; age: 68	--> ['Homo sapiens', 'gender: M', 'age: 68']
-                        # - Homo sapiens; hospitalized patient	--> ['Homo sapiens', 'hospitalized patient']
-                        # - Homo sapiens; male					--> ['Homo sapiens', 'male']
-                        # - Homo sapiens; male; 63				--> ['Homo sapiens', 'male', '63']
-                        # - Homo sapiens; male; age 29			--> ['Homo sapiens', 'male', 'age 29']
-                        # - Homo sapiens; symptomatic			--> ['Homo sapiens', 'symptomatic']
-                        if len(GBQualifier_value_text_list) > 1:
-                            host_sex = ''
-                            if 'female' in GBQualifier_value_text_list[1]:
-                                host_sex = 'female'
-                            elif 'male' in GBQualifier_value_text_list[1]:
-                                host_sex = 'male'
-                            elif 'gender' in GBQualifier_value_text_list[1]:
-                                host_sex_one_lecter = GBQualifier_value_text_list[1].split(':')[-1].strip()
-                                if host_sex_one_lecter in ['F', 'M']:
-                                    host_sex = 'female' if host_sex_one_lecter == 'F' else 'male'
-
-                            if host_sex in ['male', 'female']:
-                                info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if host_sex == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
-                            elif GBQualifier_value_text_list[1] in term_to_uri_dict:
-                                info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[GBQualifier_value_text_list[1]]                            
-                            else:
-                                missing_value_list.append('\t'.join([accession_version, 'host_sex or host_health_status', GBQualifier_value_text_list[1]]))
-
-                            # Host age
-                            host_age = -1
-                            if len(GBQualifier_value_text_list[1].split(' ')) > 1 and is_integer(GBQualifier_value_text_list[1].split(' ')[-1]):
-                                host_age = int(GBQualifier_value_text_list[1].split(' ')[-1])
-                            elif len(GBQualifier_value_text_list) > 2 and is_integer(GBQualifier_value_text_list[2].split(' ')[-1]):
-                                host_age = int(GBQualifier_value_text_list[2].split(' ')[-1])
-
-                            if host_age > -1:
-                                info_for_yaml_dict['host']['host_age'] = host_age
-                                info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
-                            elif len(GBQualifier_value_text_list) > 2:
-                                missing_value_list.append('\t'.join([accession_version, 'host_age', GBQualifier_value_text_list[2]]))
-                    elif GBQualifier_name_text == 'collected_by':
-                        if any([x in GBQualifier_value_text.lower() for x in ['institute', 'hospital', 'city', 'center']]):
-                            info_for_yaml_dict['sample']['collecting_institution'] = GBQualifier_value_text
-                        else:
-                            info_for_yaml_dict['sample']['collector_name'] = GBQualifier_value_text
-                    elif GBQualifier_name_text == 'isolation_source':
-                        if GBQualifier_value_text.upper() in term_to_uri_dict:
-                            GBQualifier_value_text = GBQualifier_value_text.upper() # For example, in case of 'usa: wa'
-
-                        # Little cleaning
-                        GBQualifier_value_text = GBQualifier_value_text.strip("/'")
-
-                        if GBQualifier_value_text in term_to_uri_dict:
-                            info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[GBQualifier_value_text]]
-                        else:
-                            if GBQualifier_value_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
-                            elif GBQualifier_value_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
-                            elif GBQualifier_value_text.lower() in ['nasal swab and throat swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
-                            elif GBQualifier_value_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
-                                info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
-                            else:
-                                missing_value_list.append('\t'.join([accession_version, 'specimen_source', GBQualifier_value_text]))
-                    elif GBQualifier_name_text == 'collection_date':
-                        # TO_DO: which format we will use?
-                        date_to_write = GBQualifier_value_text
-
-                        if len(GBQualifier_value_text.split('-')) == 1:
-                            if int(GBQualifier_value_text) < 2020:
-                                date_to_write = "{}-12-15".format(GBQualifier_value_text)
-                            else:
-                                date_to_write = "{}-01-15".format(GBQualifier_value_text)
-
-                            if 'additional_collection_information' in info_for_yaml_dict['sample']:
-                                info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
-                            else:
-                                info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
-                        elif len(GBQualifier_value_text.split('-')) == 2:
-                            date_to_write += '-15'
-
-                            if 'additional_collection_information' in info_for_yaml_dict['sample']:
-                                info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
-                            else:
-                                info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(GBQualifier_value_text)
-                        elif len(GBQualifier_value_text.split('-')) == 3:
-                            GBQualifier_value_text_list = GBQualifier_value_text.split('-')
-
-                            if GBQualifier_value_text_list[1].isalpha():
-                                date_to_write = parse(GBQualifier_value_text).strftime('%Y-%m-%d')
-
-                        info_for_yaml_dict['sample']['collection_date'] = date_to_write
-                    elif GBQualifier_name_text in ['lat_lon', 'country']:
-                        if GBQualifier_value_text == 'Hong Kong':
-                            GBQualifier_value_text = 'China: Hong Kong'
-
-                        if GBQualifier_value_text in term_to_uri_dict:
-                            info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[GBQualifier_value_text]
-                        else:
-                            missing_value_list.append('\t'.join([accession_version, GBQualifier_name_text, GBQualifier_value_text]))
-                    elif GBQualifier_name_text == 'note':
-                        if 'additional_collection_information' in info_for_yaml_dict['sample']:
-                            info_for_yaml_dict['sample']['additional_collection_information'] += '; ' + GBQualifier_value_text
-                        else:
-                            info_for_yaml_dict['sample']['additional_collection_information'] = GBQualifier_value_text
-                    elif GBQualifier_name_text == 'isolate':
-                        info_for_yaml_dict['virus']['virus_strain'] = GBQualifier_value_text
-                    elif GBQualifier_name_text == 'db_xref':
-                        info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+GBQualifier_value_text.split('taxon:')[1]
-
-
-            # Remove technology key if empty!
-            if (info_for_yaml_dict['technology']=={}):
-                del info_for_yaml_dict['technology']
-
-            with open(os.path.join(dir_fasta_and_yaml, '{}.fasta'.format(accession_version)), 'w') as fw:
-                fw.write('>{}\n{}'.format(accession_version, GBSeq_sequence.text.upper()))
-
-            with open(os.path.join(dir_fasta_and_yaml, '{}.yaml'.format(accession_version)), 'w') as fw:
-                json.dump(info_for_yaml_dict, fw, indent=2)
-
-
-            if(len(GBSeq_sequence.text) >= min_len_to_count):
-                num_seq_with_len_ge_X_bp += 1
-        except:
-            print("Unexpected error for the ID {}: {}".format(accession_version, sys.exc_info()[0]))
-            accession_with_errors_list.append(accession_version)
-            continue
-
-if len(missing_value_list) > 0:
-    path_missing_terms_tsv = 'missing_terms.tsv'
-    print('Written missing terms in {}'.format(path_missing_terms_tsv))
-    with open(path_missing_terms_tsv, 'w') as fw:
-        fw.write('\n'.join(missing_value_list))
-
-if len(accession_with_errors_list) > 0:
-    path_accession_with_errors_tsv = 'accession_with_errors.tsv'
-    print('Written the accession with errors in {}'.format(path_accession_with_errors_tsv))
-    with open(path_accession_with_errors_tsv, 'w') as fw:
-        fw.write('\n'.join(accession_with_errors_list))
-        
-print('Num. new sequences with length >= {} bp: {}'.format(min_len_to_count, num_seq_with_len_ge_X_bp))
diff --git a/scripts/sequences.acc b/scripts/sequences.acc
deleted file mode 100644
index 697d868..0000000
--- a/scripts/sequences.acc
+++ /dev/null
@@ -1,1731 +0,0 @@
-NC_045512
-MT394528
-MT394529
-MT394530
-MT394531
-MT394864
-MT396241
-MT396242
-MT396243
-MT396244
-MT396245
-MT396246
-MT396247
-MT396248
-MT396266
-MT380726
-MT380727
-MT380728
-MT380729
-MT380730
-MT380731
-MT380732
-MT380733
-MT380734
-MT385414
-MT385415
-MT385416
-MT385417
-MT385418
-MT385419
-MT385420
-MT385421
-MT385422
-MT385423
-MT385424
-MT385425
-MT385426
-MT385427
-MT385428
-MT385429
-MT385430
-MT385431
-MT385432
-MT385433
-MT385434
-MT385435
-MT385436
-MT385437
-MT385438
-MT385439
-MT385440
-MT385441
-MT385442
-MT385443
-MT385444
-MT385445
-MT385446
-MT385447
-MT385448
-MT385449
-MT385450
-MT385451
-MT385452
-MT385453
-MT385454
-MT385455
-MT385456
-MT385457
-MT385458
-MT385459
-MT385460
-MT385461
-MT385462
-MT385463
-MT385464
-MT385465
-MT385466
-MT385467
-MT385468
-MT385469
-MT385470
-MT385471
-MT385472
-MT385473
-MT385474
-MT385475
-MT385476
-MT385477
-MT385478
-MT385479
-MT385480
-MT385481
-MT385482
-MT385483
-MT385484
-MT385485
-MT385486
-MT385487
-MT385488
-MT385489
-MT385490
-MT385491
-MT385492
-MT385493
-MT385494
-MT385495
-MT385496
-MT385497
-MT186683
-MT252677
-MT252678
-MT252679
-MT252680
-MT252681
-MT252682
-MT252683
-MT252684
-MT252685
-MT252686
-MT252687
-MT252688
-MT252689
-MT252690
-MT252691
-MT252692
-MT252693
-MT252694
-MT252695
-MT252696
-MT252697
-MT252698
-MT252699
-MT252700
-MT252701
-MT252702
-MT252703
-MT252704
-MT252705
-MT252706
-MT252707
-MT252708
-MT252709
-MT252710
-MT252711
-MT252712
-MT252713
-MT252715
-MT252716
-MT252717
-MT252719
-MT252721
-MT252723
-MT252725
-MT252726
-MT252728
-MT252729
-MT252730
-MT252733
-MT252734
-MT252735
-MT252736
-MT252737
-MT252738
-MT252739
-MT252740
-MT252741
-MT252742
-MT252745
-MT252746
-MT252747
-MT252748
-MT252749
-MT252756
-MT252757
-MT252758
-MT252761
-MT252763
-MT252764
-MT252765
-MT252766
-MT252767
-MT252768
-MT252769
-MT252770
-MT252771
-MT252772
-MT252773
-MT252774
-MT252775
-MT252778
-MT252779
-MT252780
-MT252781
-MT252782
-MT252783
-MT252784
-MT252785
-MT252787
-MT252788
-MT252792
-MT252793
-MT252794
-MT252795
-MT252797
-MT252798
-MT252799
-MT252800
-MT252801
-MT252802
-MT252803
-MT252804
-MT252805
-MT252806
-MT252807
-MT252808
-MT252809
-MT252810
-MT252811
-MT252821
-MT252822
-MT252823
-MT252824
-MT339043
-MT365033
-MT374101
-MT374102
-MT374103
-MT374104
-MT374105
-MT374106
-MT374107
-MT374108
-MT374109
-MT374110
-MT374111
-MT374112
-MT374113
-MT374114
-MT374115
-MT374116
-MT375428
-MT375429
-MT375430
-MT375431
-MT375432
-MT375433
-MT375434
-MT375435
-MT375436
-MT375437
-MT375438
-MT375439
-MT375440
-MT375441
-MT375442
-MT375443
-MT375444
-MT375445
-MT375446
-MT375447
-MT375448
-MT375449
-MT375450
-MT375451
-MT375452
-MT375453
-MT375454
-MT375455
-MT375456
-MT375457
-MT375458
-MT375459
-MT375460
-MT375461
-MT375462
-MT375463
-MT375464
-MT375465
-MT375466
-MT375467
-MT375468
-MT375469
-MT375470
-MT375471
-MT375472
-MT375473
-MT375474
-MT375475
-MT375476
-MT375477
-MT375478
-MT375479
-MT375480
-MT375481
-MT375482
-MT375483
-MT370516
-MT370517
-MT370518
-MT370831
-MT370832
-MT370833
-MT370834
-MT370835
-MT370836
-MT370837
-MT370838
-MT370839
-MT370840
-MT370841
-MT370842
-MT370843
-MT370844
-MT370845
-MT370846
-MT370847
-MT370848
-MT370849
-MT370850
-MT370851
-MT370852
-MT370853
-MT370854
-MT370855
-MT370856
-MT370857
-MT370858
-MT370859
-MT370860
-MT370861
-MT370862
-MT370863
-MT370864
-MT370865
-MT370866
-MT370867
-MT370868
-MT370869
-MT370870
-MT370871
-MT370872
-MT370873
-MT370874
-MT370875
-MT370876
-MT370877
-MT370878
-MT370879
-MT370880
-MT370881
-MT370882
-MT370883
-MT370884
-MT370885
-MT370886
-MT370887
-MT370888
-MT370889
-MT370890
-MT370891
-MT370892
-MT370893
-MT370894
-MT370895
-MT370896
-MT370897
-MT370898
-MT370899
-MT370900
-MT370901
-MT370902
-MT370903
-MT370904
-MT370905
-MT370906
-MT370907
-MT370908
-MT370909
-MT370910
-MT370911
-MT370912
-MT370913
-MT370914
-MT370915
-MT370916
-MT370917
-MT370918
-MT370919
-MT370920
-MT370921
-MT370922
-MT370923
-MT370924
-MT370925
-MT370926
-MT370927
-MT370928
-MT370929
-MT370930
-MT370931
-MT370932
-MT370933
-MT370934
-MT370935
-MT370936
-MT370937
-MT370938
-MT370939
-MT370940
-MT370941
-MT370942
-MT370943
-MT370944
-MT370945
-MT370946
-MT370947
-MT370948
-MT370949
-MT370950
-MT370951
-MT370952
-MT370953
-MT370954
-MT370955
-MT370956
-MT370957
-MT370958
-MT370959
-MT370960
-MT370961
-MT370962
-MT370963
-MT370964
-MT370965
-MT370966
-MT370967
-MT370968
-MT370969
-MT370970
-MT370971
-MT370972
-MT370973
-MT370974
-MT370975
-MT370976
-MT370977
-MT370978
-MT370979
-MT370980
-MT370981
-MT370982
-MT370983
-MT370984
-MT370985
-MT370986
-MT370987
-MT370988
-MT370989
-MT370990
-MT370991
-MT370992
-MT370993
-MT370994
-MT370995
-MT370996
-MT370997
-MT370998
-MT370999
-MT371000
-MT371001
-MT371002
-MT371003
-MT371004
-MT371005
-MT371006
-MT371007
-MT371008
-MT371009
-MT371010
-MT371011
-MT371012
-MT371013
-MT371014
-MT371015
-MT371016
-MT371017
-MT371018
-MT371019
-MT371020
-MT371021
-MT371022
-MT371023
-MT371024
-MT371025
-MT371026
-MT371027
-MT371028
-MT371029
-MT371030
-MT371031
-MT371032
-MT371033
-MT371034
-MT371035
-MT371036
-MT371037
-MT371038
-MT371047
-MT371048
-MT371049
-MT371050
-MT371568
-MT371569
-MT371570
-MT371571
-MT371572
-MT371573
-MT371574
-MT372480
-MT372481
-MT372482
-MT372483
-7BV2_P
-7BV2_T
-LC542976
-LC542809
-MT114412
-MT114413
-MT114414
-MT114415
-MT114416
-MT114417
-MT114418
-MT114419
-MT230904
-MT358401
-MT358402
-MT358637
-MT358644
-MT358645
-MT358646
-MT358647
-MT358648
-MT358649
-MT358650
-MT358651
-MT358652
-MT358653
-MT358654
-MT358655
-MT358656
-MT358657
-MT358658
-MT358659
-MT358660
-MT358661
-MT358662
-MT358663
-MT358664
-MT358665
-MT358666
-MT358667
-MT358668
-MT358669
-MT358670
-MT358671
-MT358672
-MT358673
-MT358674
-MT358675
-MT358676
-MT358677
-MT358678
-MT358679
-MT358680
-MT358681
-MT358682
-MT358683
-MT358684
-MT358685
-MT358686
-MT358687
-MT358688
-MT358689
-MT358690
-MT358691
-MT358692
-MT358693
-MT358694
-MT358695
-MT358696
-MT358697
-MT358698
-MT358699
-MT358700
-MT358701
-MT358702
-MT358703
-MT358704
-MT358705
-MT358706
-MT358707
-MT358708
-MT358709
-MT358710
-MT358711
-MT358712
-MT358713
-MT358714
-MT358715
-MT358716
-MT358717
-MT358718
-MT358719
-MT358720
-MT358721
-MT358722
-MT358723
-MT358724
-MT358725
-MT358726
-MT358727
-MT358728
-MT358729
-MT358730
-MT358731
-MT358732
-MT358733
-MT358734
-MT358735
-MT358736
-MT358737
-MT358738
-MT358739
-MT358740
-MT358741
-MT358742
-MT358743
-MT358744
-MT358745
-MT358746
-MT358747
-MT358748
-MT359231
-MT359865
-MT359866
-MT350234
-MT350236
-MT350237
-MT350238
-MT350239
-MT350240
-MT350241
-MT350242
-MT350243
-MT350244
-MT350245
-MT350246
-MT350247
-MT350248
-MT350249
-MT350250
-MT350251
-MT350252
-MT350253
-MT350254
-MT350255
-MT350256
-MT350257
-MT350263
-MT350264
-MT350265
-MT350266
-MT350267
-MT350268
-MT350269
-MT350270
-MT350271
-MT350272
-MT350273
-MT350274
-MT350275
-MT350276
-MT350277
-MT350278
-MT350279
-MT350280
-MT350282
-MT344135
-MT344944
-MT344945
-MT344946
-MT344947
-MT344948
-MT344949
-MT344950
-MT344951
-MT344952
-MT344953
-MT344954
-MT344955
-MT344956
-MT344957
-MT344958
-MT344959
-MT344960
-MT344961
-MT344962
-MT344963
-MT345798
-MT345799
-MT345800
-MT345801
-MT345802
-MT345803
-MT345804
-MT345805
-MT345806
-MT345807
-MT345808
-MT345809
-MT345810
-MT345811
-MT345812
-MT345813
-MT345814
-MT345815
-MT345816
-MT345817
-MT345818
-MT345819
-MT345820
-MT345821
-MT345822
-MT345823
-MT345824
-MT345825
-MT345826
-MT345827
-MT345828
-MT345829
-MT345830
-MT345831
-MT345832
-MT345833
-MT345834
-MT345835
-MT345836
-MT345837
-MT345838
-MT345839
-MT345840
-MT345841
-MT345842
-MT345843
-MT345844
-MT345845
-MT345846
-MT345847
-MT345848
-MT345849
-MT345850
-MT345851
-MT345852
-MT345853
-MT345854
-MT345855
-MT345856
-MT345857
-MT345858
-MT345859
-MT345860
-MT345861
-MT345862
-MT345863
-MT345864
-MT345865
-MT345866
-MT345867
-MT345868
-MT345869
-MT345870
-MT345871
-MT345872
-MT345873
-MT345874
-MT345875
-MT345876
-MT345877
-MT345878
-MT345879
-MT345880
-MT345881
-MT345882
-MT345883
-MT345884
-MT345885
-MT345886
-MT345887
-MT345888
-MT339039
-MT339040
-MT339041
-MT334522
-MT334523
-MT334524
-MT334525
-MT334526
-MT334527
-MT334528
-MT334529
-MT334530
-MT334531
-MT334532
-MT334533
-MT334534
-MT334535
-MT334536
-MT334537
-MT334538
-MT334539
-MT334540
-MT334541
-MT334542
-MT334543
-MT334544
-MT334545
-MT334546
-MT334547
-MT334548
-MT334549
-MT334550
-MT334551
-MT334552
-MT334553
-MT334554
-MT334555
-MT334556
-MT334557
-MT334558
-MT334559
-MT334560
-MT334561
-MT334562
-MT334563
-MT334564
-MT334565
-MT334566
-MT334567
-MT334568
-MT334569
-MT334570
-MT334571
-MT334572
-MT334573
-MT324062
-MT324679
-MT324680
-MT324681
-MT324682
-MT324683
-MT324684
-MT325561
-MT325562
-MT325563
-MT325564
-MT325565
-MT325566
-MT325567
-MT325568
-MT325569
-MT325570
-MT325571
-MT325572
-MT325573
-MT325574
-MT325575
-MT325576
-MT325577
-MT325578
-MT325579
-MT325580
-MT325581
-MT325582
-MT325583
-MT325584
-MT325585
-MT325586
-MT325587
-MT325588
-MT325589
-MT325590
-MT325591
-MT325592
-MT325593
-MT325594
-MT325595
-MT325596
-MT325597
-MT325598
-MT325599
-MT325600
-MT325601
-MT325602
-MT325603
-MT325604
-MT325605
-MT325606
-MT325607
-MT325608
-MT325609
-MT325610
-MT325611
-MT325612
-MT325613
-MT325614
-MT325615
-MT325616
-MT325617
-MT325618
-MT325619
-MT325620
-MT325621
-MT325622
-MT325623
-MT325624
-MT325625
-MT325626
-MT325627
-MT325628
-MT325629
-MT325630
-MT325631
-MT325632
-MT325633
-MT325634
-MT325635
-MT325636
-MT325637
-MT325638
-MT325639
-MT325640
-MT326023
-MT326024
-MT326025
-MT326026
-MT326027
-MT326028
-MT326029
-MT326030
-MT326031
-MT326032
-MT326033
-MT326034
-MT326035
-MT326036
-MT326037
-MT326038
-MT326039
-MT326040
-MT326041
-MT326042
-MT326043
-MT326044
-MT326045
-MT326046
-MT326047
-MT326048
-MT326049
-MT326050
-MT326051
-MT326052
-MT326053
-MT326054
-MT326055
-MT326056
-MT326057
-MT326058
-MT326059
-MT326060
-MT326061
-MT326062
-MT326063
-MT326064
-MT326065
-MT326066
-MT326067
-MT326068
-MT326069
-MT326070
-MT326071
-MT326072
-MT326073
-MT326074
-MT326075
-MT326076
-MT326077
-MT326078
-MT326079
-MT326080
-MT326081
-MT326082
-MT326083
-MT326084
-MT326085
-MT326086
-MT326087
-MT326088
-MT326089
-MT326090
-MT326091
-MT326092
-MT326093
-MT326094
-MT326095
-MT326096
-MT326097
-MT326098
-MT326099
-MT326100
-MT326101
-MT326102
-MT326103
-MT326104
-MT326105
-MT326106
-MT326107
-MT326108
-MT326109
-MT326110
-MT326111
-MT326112
-MT326113
-MT326114
-MT326115
-MT326116
-MT326117
-MT326118
-MT326119
-MT326120
-MT326121
-MT326122
-MT326123
-MT326124
-MT326125
-MT326126
-MT326127
-MT326128
-MT326129
-MT326130
-MT326131
-MT326132
-MT326133
-MT326134
-MT326135
-MT326136
-MT326137
-MT326138
-MT326139
-MT326140
-MT326141
-MT326142
-MT326143
-MT326144
-MT326145
-MT326146
-MT326147
-MT326148
-MT326149
-MT326150
-MT326151
-MT326152
-MT326153
-MT326154
-MT326155
-MT326156
-MT326157
-MT326158
-MT326159
-MT326160
-MT326161
-MT326162
-MT326163
-MT326164
-MT326165
-MT326166
-MT326167
-MT326168
-MT326169
-MT326170
-MT326171
-MT326172
-MT326173
-MT326174
-MT326175
-MT326176
-MT326177
-MT326178
-MT326179
-MT326180
-MT326181
-MT326182
-MT326183
-MT326184
-MT326185
-MT326186
-MT326187
-MT326188
-MT326189
-MT326190
-MT326191
-MT327745
-MT328032
-MT328033
-MT328034
-MT328035
-MT039874
-MT077125
-MT322394
-MT322395
-MT322396
-MT322397
-MT322398
-MT322399
-MT322400
-MT322401
-MT322402
-MT322403
-MT322404
-MT322405
-MT322406
-MT322407
-MT322408
-MT322409
-MT322410
-MT322411
-MT322412
-MT322413
-MT322414
-MT322415
-MT322416
-MT322417
-MT322418
-MT322419
-MT322420
-MT322421
-MT322422
-MT322423
-MT322424
-MT320538
-MT320891
-MT308692
-MT308693
-MT308694
-MT308695
-MT308696
-MT308697
-MT308698
-MT308699
-MT308700
-MT308701
-MT308702
-MT308703
-MT308704
-MT293547
-MT300186
-MT304474
-MT304475
-MT304476
-MT304477
-MT304478
-MT304479
-MT304480
-MT304481
-MT304482
-MT304483
-MT304484
-MT304485
-MT304486
-MT304487
-MT304488
-MT304489
-MT304490
-MT304491
-MT273658
-MT281530
-MT281577
-MT291826
-MT291827
-MT291828
-MT291829
-MT291830
-MT291831
-MT291832
-MT291833
-MT291834
-MT291835
-MT291836
-MT292569
-MT292570
-MT292571
-MT292572
-MT292573
-MT292574
-MT292575
-MT292576
-MT292577
-MT292578
-MT292579
-MT292580
-MT292581
-MT292582
-MT293156
-MT293157
-MT293158
-MT293159
-MT293160
-MT293161
-MT293162
-MT293163
-MT293164
-MT293165
-MT293166
-MT293167
-MT293168
-MT293169
-MT293170
-MT293171
-MT293172
-MT293173
-MT293174
-MT293175
-MT293176
-MT293177
-MT293178
-MT293179
-MT293180
-MT293181
-MT293182
-MT293183
-MT293184
-MT293185
-MT293186
-MT293187
-MT293188
-MT293189
-MT293190
-MT293191
-MT293192
-MT293193
-MT293194
-MT293195
-MT293196
-MT293197
-MT293198
-MT293199
-MT293200
-MT293201
-MT293202
-MT293203
-MT293204
-MT293205
-MT293206
-MT293207
-MT293208
-MT293209
-MT293210
-MT293211
-MT293212
-MT293213
-MT293214
-MT293215
-MT293216
-MT293217
-MT293218
-MT293219
-MT293220
-MT293221
-MT293222
-MT293223
-MT293224
-MT293225
-MT295464
-MT295465
-MT276323
-MT276324
-MT276325
-MT276326
-MT276327
-MT276328
-MT276329
-MT276330
-MT276331
-MT276597
-MT276598
-MT262896
-MT262897
-MT262898
-MT262899
-MT262900
-MT262901
-MT262902
-MT262903
-MT262904
-MT262905
-MT262906
-MT262907
-MT262908
-MT262909
-MT262910
-MT262911
-MT262912
-MT262913
-MT262914
-MT262915
-MT262916
-MT262993
-MT263074
-MT263381
-MT263382
-MT263383
-MT263384
-MT263385
-MT263386
-MT263387
-MT263388
-MT263389
-MT263390
-MT263391
-MT263392
-MT263393
-MT263394
-MT263395
-MT263396
-MT263397
-MT263398
-MT263399
-MT263400
-MT263401
-MT263402
-MT263403
-MT263404
-MT263405
-MT263406
-MT263407
-MT263408
-MT263409
-MT263410
-MT263411
-MT263412
-MT263413
-MT263414
-MT263415
-MT263416
-MT263417
-MT263418
-MT263419
-MT263420
-MT263421
-MT263422
-MT263423
-MT263424
-MT263425
-MT263426
-MT263427
-MT263428
-MT263429
-MT263430
-MT263431
-MT263432
-MT263433
-MT263434
-MT263435
-MT263436
-MT263437
-MT263438
-MT263439
-MT263440
-MT263441
-MT263442
-MT263443
-MT263444
-MT263445
-MT263446
-MT263447
-MT263448
-MT263449
-MT263450
-MT263451
-MT263452
-MT263453
-MT263454
-MT263455
-MT263456
-MT263457
-MT263458
-MT263459
-MT263460
-MT263461
-MT263462
-MT263463
-MT263464
-MT263465
-MT263466
-MT263467
-MT263468
-MT263469
-MT256917
-MT256918
-MT256924
-MT258377
-MT258378
-MT258379
-MT258380
-MT258381
-MT258382
-MT258383
-MT259226
-MT259227
-MT259228
-MT259229
-MT259230
-MT259231
-MT259235
-MT259236
-MT259237
-MT259238
-MT259239
-MT259240
-MT259241
-MT259242
-MT259243
-MT259244
-MT259245
-MT259246
-MT259247
-MT259248
-MT259249
-MT259250
-MT259251
-MT259252
-MT259253
-MT259254
-MT259255
-MT259256
-MT259257
-MT259258
-MT259259
-MT259260
-MT259261
-MT259262
-MT259263
-MT259264
-MT259265
-MT259266
-MT259267
-MT259268
-MT259269
-MT259270
-MT259271
-MT259272
-MT259273
-MT259274
-MT259275
-MT259276
-MT259277
-MT259278
-MT259279
-MT259280
-MT259281
-MT259282
-MT259283
-MT259284
-MT259285
-MT259286
-MT259287
-LC534418
-LC534419
-MT251972
-MT251973
-MT251974
-MT251975
-MT251976
-MT251977
-MT251978
-MT251979
-MT251980
-MT253696
-MT253697
-MT253698
-MT253699
-MT253700
-MT253701
-MT253702
-MT253703
-MT253704
-MT253705
-MT253706
-MT253707
-MT253708
-MT253709
-MT253710
-MT233526
-MT246449
-MT246450
-MT246451
-MT246452
-MT246453
-MT246454
-MT246455
-MT246456
-MT246457
-MT246458
-MT246459
-MT246460
-MT246461
-MT246462
-MT246463
-MT246464
-MT246465
-MT246466
-MT246467
-MT246468
-MT246469
-MT246470
-MT246471
-MT246472
-MT246473
-MT246474
-MT246475
-MT246476
-MT246477
-MT246478
-MT246479
-MT246480
-MT246481
-MT246482
-MT246483
-MT246484
-MT246485
-MT246486
-MT246487
-MT246488
-MT246489
-MT246490
-MT246667
-MT240479
-MT232869
-MT232870
-MT232871
-MT232872
-MT233519
-MT233520
-MT233521
-MT233522
-MT233523
-MT226610
-MT198651
-MT198652
-MT198653
-MT192758
-MT192759
-MT192765
-MT192772
-MT192773
-MT186676
-MT186677
-MT186678
-MT186679
-MT186680
-MT186681
-MT186682
-MT187977
-MT188339
-MT188340
-MT188341
-CADDYA000000000
-MT184907
-MT184908
-MT184909
-MT184910
-MT184911
-MT184912
-MT184913
-MT163712
-MT163714
-MT163715
-MT163716
-MT163717
-MT163718
-MT163719
-MT163720
-MT163721
-MT163737
-MT163738
-MT066156
-MT121215
-MT159705
-MT159706
-MT159707
-MT159708
-MT159709
-MT159710
-MT159711
-MT159712
-MT159713
-MT159714
-MT159715
-MT159716
-MT159717
-MT159718
-MT159719
-MT159720
-MT159721
-MT159722
-MT159778
-MT161607
-LC529905
-MT012098
-MT050493
-MT152824
-MT152900
-MT135041
-MT135042
-MT135043
-MT135044
-MT126808
-MT127113
-MT127114
-MT127115
-MT127116
-LC528232
-LC528233
-MT123290
-MT123291
-MT123292
-MT123293
-MT118835
-MT111895
-MT111896
-MT106052
-MT106053
-MT106054
-MT093571
-MT093631
-MT081059
-MT081060
-MT081061
-MT081062
-MT081063
-MT081064
-MT081065
-MT081066
-MT081067
-MT081068
-MT072667
-MT072668
-MT072688
-MT066157
-MT066158
-MT066159
-MT066175
-MT066176
-LC523807
-LC523808
-LC523809
-MT042773
-MT042774
-MT042775
-MT042776
-MT042777
-MT042778
-MT044257
-MT044258
-MT049951
-MT050414
-MT050415
-MT050416
-MT050417
-MT039873
-MT039887
-MT039888
-MT039890
-LC522350
-MT027062
-MT027063
-MT027064
-MT019529
-MT019530
-MT019531
-MT019532
-MT019533
-MT020781
-MT020880
-MT020881
-LR757995
-LR757996
-LR757997
-LR757998
-MT007544
-MT008022
-MT008023
-MN996527
-MN996528
-MN996529
-MN996530
-MN996531
-MN988668
-MN988669
-MN994467
-MN994468
-MN997409
-MN988713
-MN938384
-MN938385
-MN938386
-MN938387
-MN938388
-MN938389
-MN938390
-MN975262
-MN975263
-MN975264
-MN975265
-MN975266
-MN975267
-MN975268
-MN985325
-MN970003
-MN970004
-MN908947
-- 
cgit 1.4.1


From 640675e07505dc0f7969480fe0643653e6408ec2 Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Mon, 22 Jun 2020 23:29:35 +0200
Subject: added new script to prepare metadata of sra data

---
 .../SraExperimentPackage.2020.06.08.xml.gz         | Bin 0 -> 3227724 bytes
 scripts/download_sra_data/download_sra_data.py     | 232 +++++++++++++++++++++
 2 files changed, 232 insertions(+)
 create mode 100644 scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz
 create mode 100644 scripts/download_sra_data/download_sra_data.py

diff --git a/scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz b/scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz
new file mode 100644
index 0000000..f9cd995
Binary files /dev/null and b/scripts/download_sra_data/SraExperimentPackage.2020.06.08.xml.gz differ
diff --git a/scripts/download_sra_data/download_sra_data.py b/scripts/download_sra_data/download_sra_data.py
new file mode 100644
index 0000000..9145a43
--- /dev/null
+++ b/scripts/download_sra_data/download_sra_data.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+
+import os
+from dateutil.parser import parse
+import xml.etree.ElementTree as ET
+import json
+import gzip
+
+dir_yaml = 'yaml'
+
+date = '2020.06.08'
+
+# Query on SRA: 'txid2697049[Organism]' (https://www.ncbi.nlm.nih.gov/sra/?term=txid2697049%5BOrganism%5D) -> Send to -> File -> Full XML -> Create File
+path_sra_metadata_xml = 'SraExperimentPackage.{}.xml.gz'.format(date)
+
+dir_dict_ontology_standardization = '../dict_ontology_standardization/'
+path_sra_study_accessions_txt = 'SRAStudyAccessions.{}.txt'.format(date)
+
+term_to_uri_dict = {}
+
+for path_dict_xxx_csv in [os.path.join(dir_dict_ontology_standardization, name_xxx_csv) for name_xxx_csv in os.listdir(dir_dict_ontology_standardization) if name_xxx_csv.endswith('.csv')]:
+    print('Read {}'.format(path_dict_xxx_csv))
+
+    with open(path_dict_xxx_csv, 'r') as f:
+        for line in f:
+            if len(line.split(',')) > 2:
+                term, uri = line.strip('\n').split('",')
+                term = term.strip('"')
+            else:
+                term, uri = line.strip('\n').split(',')
+
+            term_to_uri_dict[term] = uri
+
+def is_integer(string_to_check):
+    try:
+        int(string_to_check)
+        return True
+    except ValueError:
+        return False
+
+if not os.path.exists(dir_yaml):
+    os.makedirs(dir_yaml)
+
+sra_metadata_xml_file = gzip.open(path_sra_metadata_xml, 'r')
+tree = ET.parse(sra_metadata_xml_file)
+sra_metadata_xml_file.close()
+
+EXPERIMENT_PACKAGE_SET = tree.getroot()
+
+missing_value_list = []
+
+run_accession_set = set()
+run_accession_to_downloadble_file_url_dict = {}
+
+for i, EXPERIMENT_PACKAGE in enumerate(EXPERIMENT_PACKAGE_SET):
+    #print(i, EXPERIMENT_PACKAGE)
+    
+    # A general default-empty yaml could be read from the definitive one
+    info_for_yaml_dict = {
+        'id': 'placeholder',
+        'host': {},
+        'sample': {},
+        'virus': {},
+        'technology': {},
+        'submitter': {}
+    }
+
+    RUN_SET = EXPERIMENT_PACKAGE.find('RUN_SET')
+    RUN = RUN_SET.find('RUN')
+    accession = RUN.attrib['accession']
+    run_accession_set.add(accession)
+    #print(accession)
+    
+    info_for_yaml_dict['sample']['sample_id'] = accession
+    
+    SRAFiles = RUN.find('SRAFiles')
+    if SRAFiles is not None:
+        url = SRAFiles.find('SRAFile').attrib['url']
+        if 'sra-download.ncbi.nlm.nih.gov' in url:
+            run_accession_to_downloadble_file_url_dict[accession] = url
+  
+
+    SAMPLE = EXPERIMENT_PACKAGE.find('SAMPLE')
+    SAMPLE_ATTRIBUTE_list = SAMPLE.iter('SAMPLE_ATTRIBUTE')
+    
+    for SAMPLE_ATTRIBUTE in SAMPLE_ATTRIBUTE_list:
+        VALUE = SAMPLE_ATTRIBUTE.find('VALUE')
+        if VALUE is not None:
+            TAG_text = SAMPLE_ATTRIBUTE.find('TAG').text
+            VALUE_text = VALUE.text
+
+            if TAG_text in ['host', 'host scientific name']:
+                if VALUE_text in term_to_uri_dict:
+                    info_for_yaml_dict['host']['host_species'] = term_to_uri_dict[VALUE_text]
+                else:
+                    missing_value_list.append('\t'.join([accession, 'host_species', VALUE_text]))
+            elif TAG_text in ['host_health_status', 'host health state']:
+                if VALUE_text in term_to_uri_dict:
+                    info_for_yaml_dict['host']['host_health_status'] = term_to_uri_dict[VALUE_text]                            
+                elif VALUE_text.strip("'") not in ['missing', 'not collected', 'not provided']:
+                    missing_value_list.append('\t'.join([accession, 'host_health_status', VALUE_text]))
+            elif TAG_text in ['strain', 'isolate']:
+                if VALUE_text.lower() not in ['not applicable', 'missing', 'na', 'unknown']:
+                    if 'virus_strain' not in info_for_yaml_dict:
+                        info_for_yaml_dict['virus']['virus_strain'] = VALUE_text
+                    else:
+                        info_for_yaml_dict['virus']['virus_strain'] += '; ' + VALUE_text
+            elif TAG_text in ['isolation_source', 'isolation source host-associated']:                    
+                if VALUE_text in term_to_uri_dict:
+                    info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict[VALUE_text]]
+                else:
+                    if VALUE_text.lower() in ['np/op', 'np/op swab', 'np/np swab', 'nasopharyngeal and oropharyngeal swab', 'nasopharyngeal/oropharyngeal swab', 'combined nasopharyngeal and oropharyngeal swab']:
+                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['oropharyngeal swab']]
+                    elif VALUE_text.lower() in ['nasopharyngeal swab/throat swab', 'nasopharyngeal/throat swab', 'nasopharyngeal swab and throat swab', 'nasal swab and throat swab', 'nasopharyngeal aspirate/throat swab']:
+                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasopharyngeal swab'], term_to_uri_dict['throat swab']]
+                    elif VALUE_text.lower() in ['nasal swab and throat swab']:
+                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['throat swab']]
+                    elif VALUE_text.lower() in ['nasal-swab and oro-pharyngeal swab']:
+                        info_for_yaml_dict['sample']['specimen_source'] = [term_to_uri_dict['nasal swab'], term_to_uri_dict['oropharyngeal swab']]
+                    elif VALUE_text.strip("'") not in ['missing', 'not collected', 'unknown', 'not provided', 'not applicable', 'N/A']:
+                        missing_value_list.append('\t'.join([accession, 'specimen_source', VALUE_text]))
+            elif TAG_text in ['host_sex', 'host sex']:
+                if VALUE_text.lower() not in ['missing', 'not provided']:
+                    if VALUE_text in ['male', 'female']:
+                        info_for_yaml_dict['host']['host_sex'] = "http://purl.obolibrary.org/obo/PATO_0000384" if VALUE_text == 'male' else "http://purl.obolibrary.org/obo/PATO_0000383"
+                    else:
+                        missing_value_list.append('\t'.join([accession, 'host_sex', VALUE_text]))
+            elif TAG_text in ['host_age', 'host age']:
+                if is_integer(VALUE_text):
+                    info_for_yaml_dict['host']['host_age'] = VALUE_text
+                    info_for_yaml_dict['host']['host_age_unit'] = 'http://purl.obolibrary.org/obo/UO_0000036'
+            elif TAG_text == 'collected_by':
+                if VALUE_text.lower() not in ['not available', 'missing']:
+                    name = VALUE_text in ['Dr. Susie Bartlett', 'Ahmed Babiker', 'Aisi Fu', 'Brandi Williamson', 'George Taiaroa', 'Natacha Ogando', 'Tim Dalebout', 'ykut Ozdarendeli']
+                    
+                    info_for_yaml_dict['sample']['collector_name' if name else 'collecting_institution'] = VALUE_text
+            elif TAG_text == 'collecting institution':
+                if VALUE_text.lower() not in ['not provided', 'na']:
+                    info_for_yaml_dict['sample']['collecting_institution'] = VALUE_text
+            elif TAG_text in ['collection_date', 'collection date']:
+                if VALUE_text.lower() not in ['not applicable', 'missing', 'na']:
+                    date_to_write = VALUE_text
+                    date_is_estimated = True
+                    
+                    VALUE_text_list = VALUE_text.split('-')
+                    if len(VALUE_text_list) == 3:
+                        date_is_estimated = False
+                        
+                        if VALUE_text_list[1].isalpha():
+                            date_to_write = parse(VALUE_text).strftime('%Y-%m-%d')
+                    elif len(VALUE_text_list) == 2:
+                        date_to_write =  VALUE_text + '-15'
+                    else:
+                        if int(VALUE_text) < 2020:
+                            date_to_write = "{}-12-15".format(VALUE_text)
+                        else:
+                            date_to_write = "{}-01-15".format(VALUE_text)
+
+                    info_for_yaml_dict['sample']['collection_date'] = date_to_write
+                    
+                    if date_is_estimated:
+                        if 'additional_collection_information' in info_for_yaml_dict['sample']:
+                            info_for_yaml_dict['sample']['additional_collection_information'] += "; The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
+                        else:
+                            info_for_yaml_dict['sample']['additional_collection_information'] = "The 'collection_date' is estimated (the original date was: {})".format(VALUE_text)
+            elif TAG_text == 'geo_loc_name':
+                if VALUE_text in term_to_uri_dict:
+                    info_for_yaml_dict['sample']['collection_location'] = term_to_uri_dict[VALUE_text]
+                elif VALUE_text.lower() not in ['na', 'not applicable']:
+                    missing_value_list.append('\t'.join([accession, 'geo_loc_name', VALUE_text]))
+            #else:
+            #    if TAG_text not in ['lat_lon', 'host_disease', 'BioSampleModel', 'passage_history']:
+            #        print(accession, TAG_text, VALUE_text)
+
+
+    taxon_id = SAMPLE.find('SAMPLE_NAME').find('TAXON_ID').text
+    info_for_yaml_dict['virus']['virus_species'] = "http://purl.obolibrary.org/obo/NCBITaxon_"+taxon_id
+    
+    
+    EXPERIMENT = EXPERIMENT_PACKAGE.find('EXPERIMENT')
+    INSTRUMENT_MODEL = [x.text for x in EXPERIMENT.find('PLATFORM').iter('INSTRUMENT_MODEL')][0]
+    if INSTRUMENT_MODEL.lower() != 'unspecified':
+        if INSTRUMENT_MODEL in term_to_uri_dict:
+            info_for_yaml_dict['technology']['sample_sequencing_technology'] = [term_to_uri_dict[INSTRUMENT_MODEL]]
+        else:
+            missing_value_list.append('\t'.join([accession, 'sample_sequencing_technology', INSTRUMENT_MODEL]))
+    
+    LIBRARY_DESCRIPTOR = EXPERIMENT.find('DESIGN').find('LIBRARY_DESCRIPTOR')
+    if LIBRARY_DESCRIPTOR.text not in ['OTHER']:
+        info_for_yaml_dict['technology']['additional_technology_information'] = 'LIBRARY_STRATEGY: {};'.format(LIBRARY_DESCRIPTOR.find('LIBRARY_STRATEGY').text)
+           
+            
+    SUBMISSION = EXPERIMENT_PACKAGE.find('SUBMISSION')
+    info_for_yaml_dict['submitter']['submitter_sample_id'] = SUBMISSION.attrib['accession']
+    
+    if SUBMISSION.attrib['lab_name'].lower() not in ['na']:
+        info_for_yaml_dict['submitter']['originating_lab'] = SUBMISSION.attrib['lab_name']
+
+    STUDY = EXPERIMENT_PACKAGE.find('STUDY')     
+    info_for_yaml_dict['submitter']['publication'] = SUBMISSION.attrib['lab_name']
+    
+    
+    Organization = EXPERIMENT_PACKAGE.find('Organization')
+    Organization_Name = Organization.find('Name')
+    info_for_yaml_dict['submitter']['authors'] = [Organization_Name.text]
+        
+    Organization_Contact = Organization.find('Contact')
+    if Organization_Contact is not None:
+        Organization_Contact_Name = Organization_Contact.find('Name')
+        info_for_yaml_dict['submitter']['submitter_name'] = [Organization_Contact_Name.find('First').text + ' ' + Organization_Contact_Name.find('Last').text]
+        info_for_yaml_dict['submitter']['additional_submitter_information'] = Organization_Contact.attrib['email']
+
+        Organization_Concact_Address = Organization_Contact.find('Address')
+        if Organization_Concact_Address is not None:
+            info_for_yaml_dict['submitter']['submitter_address'] = '; '.join([x.text for x in Organization_Concact_Address] + ['Postal code ' + Organization_Concact_Address.attrib['postal_code']])
+
+    Organization_Address = Organization.find('Address')
+    if Organization_Address is not None:
+        info_for_yaml_dict['submitter']['lab_address'] = '; '.join([x.text for x in Organization_Address] + ['Postal code ' + Organization_Address.attrib['postal_code']])
+    
+    if 'collection_date' not in info_for_yaml_dict['sample']:
+        info_for_yaml_dict['sample']['collection_date'] = '1970-01-01'
+        info_for_yaml_dict['sample']['additional_collection_information'] = "The real 'collection_date' is missing"
+
+    with open(os.path.join(dir_yaml, '{}.yaml'.format(accession)), 'w') as fw:
+        json.dump(info_for_yaml_dict, fw, indent=2)
+    
+if len(missing_value_list) > 0:
+    path_missing_terms_tsv = 'missing_terms.tsv'
+    print('Written missing terms in {}'.format(path_missing_terms_tsv))
+    with open(path_missing_terms_tsv, 'w') as fw:
+        fw.write('\n'.join(missing_value_list))
\ No newline at end of file
-- 
cgit 1.4.1


From 2d822c049f2b7f3e62a3ac4c19c827428b6f73e2 Mon Sep 17 00:00:00 2001
From: Andrea Guarracino
Date: Mon, 22 Jun 2020 23:38:22 +0200
Subject: very little readme for the scripts

---
 scripts/README.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 scripts/README.md

diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..a3ed1b6
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,6 @@
+### Instructions for download and/or prepare the data and/ or the metadata
+
+Just go into the `download_genbank_data` or `download_sra_data` directory and execute the python3 script inside.
+
+- `download_genbank_data/from_genbank_to_fasta_and_yaml.py` downloads the data and the matadata, preparing the FASTA and the YAML files;
+- `download_sra_data/download_sra_data.py` creates the metadata in the form of YAML files from the SraExperimentPackage.XXX.xml.gz file in the same directory.
-- 
cgit 1.4.1