# bh20-seq-resource - Tool to upload SARS-CoV-2 sequences to the BH20 Arvados instance and orchestrate analysis

import os


def is_integer(string_to_check):
    """Return True if ``int()`` accepts *string_to_check*, False otherwise.

    Args:
        string_to_check: Value to test, normally a string such as ``"42"``.

    Returns:
        True when ``int(string_to_check)`` succeeds; False when it raises
        ``ValueError`` (e.g. ``"abc"``, ``""``, ``"4.2"``) or ``TypeError``
        (e.g. ``None`` or a list).
    """
    try:
        int(string_to_check)
    except (ValueError, TypeError):
        # TypeError covers non-string, non-numeric inputs such as None, so
        # the predicate answers False instead of crashing the caller.
        return False
    return True


def chunks(lst, n):
    """Yield consecutive slices of *lst*, each holding at most *n* items.

    Slices are produced lazily in order; the final slice is shorter when
    ``len(lst)`` is not a multiple of *n*.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))


def _parse_ontology_dictionary_line(line):
    """Split one ``term,uri`` dictionary line into a ``(term, uri)`` pair.

    Raises:
        ValueError: if the line does not split into exactly two parts.
    """
    record = line.strip('\n')
    # More than one comma means the term itself contains commas; such terms
    # are double-quoted, so the term/URI boundary is the closing quote
    # followed by a comma.
    separator = '",' if len(record.split(',')) > 2 else ','
    term, uri = record.split(separator)
    return term.strip('"'), uri


def _read_ontology_dictionary(path_dict_xxx_csv):
    """Return the ``(term, uri)`` pairs of one dictionary file, in file order.

    Blank lines are skipped. A malformed line raises ``ValueError`` naming
    the file and line number.
    """
    entries = []
    with open(path_dict_xxx_csv) as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate empty lines (e.g. a trailing newline at EOF).
                continue
            try:
                entries.append(_parse_ontology_dictionary_line(line))
            except ValueError as err:
                raise ValueError(
                    f'{path_dict_xxx_csv}:{line_number}: '
                    f'malformed dictionary line {line!r}'
                ) from err
    return entries


def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
    """Load the ontology dictionaries found in a directory.

    Every ``*.csv`` file in *dir_ontology_dictionaries* is read as a list of
    ``term,uri`` lines (a term containing commas must be double-quoted).
    A warning is printed for each term that appears more than once, both
    across all dictionaries and within a single dictionary; the first
    occurrence within a dictionary is the one that is kept.

    Args:
        dir_ontology_dictionaries: Path of the directory holding the CSV
            dictionaries.

    Returns:
        A dict mapping each field (the file name up to its first ``.``) to a
        dict mapping term to URI. The dictionaries are kept separate per
        field to avoid, for example, that a valid IRI for species is
        accepted as specimen.

    Raises:
        ValueError: if a non-blank line cannot be split into a term and a
            URI.
        OSError: if the directory or one of the files cannot be read.
    """
    # os.listdir returns entries in arbitrary order; sort so that warnings
    # and the resulting field order are reproducible between runs.
    path_dict_xxx_csv_list = sorted(
        os.path.join(dir_ontology_dictionaries, name_xxx_csv)
        for name_xxx_csv in os.listdir(dir_ontology_dictionaries)
        if name_xxx_csv.endswith('.csv')
    )

    # First pass: read every file once and report terms that are duplicated
    # anywhere in the whole set of dictionaries.
    entries_by_path = []
    seen_terms = set()

    for path_dict_xxx_csv in path_dict_xxx_csv_list:
        print(f'Read {path_dict_xxx_csv}')

        entries = _read_ontology_dictionary(path_dict_xxx_csv)
        entries_by_path.append((path_dict_xxx_csv, entries))

        for term, _uri in entries:
            if term in seen_terms:
                print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
                continue

            seen_terms.add(term)

    # Second pass: prepare separated dictionaries (to avoid, for example,
    # that a valid IRI for species is accepted as specimen).
    field_to_term_to_uri_dict = {}

    for path_dict_xxx_csv, entries in entries_by_path:
        field = os.path.basename(path_dict_xxx_csv).split('.')[0]

        term_to_uri_dict = {}
        field_to_term_to_uri_dict[field] = term_to_uri_dict

        for term, uri in entries:
            if term in term_to_uri_dict:
                print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
                continue

            term_to_uri_dict[term] = uri

    return field_to_term_to_uri_dict