author     Pjotr Prins  2021-01-01 12:27:00 +0000
committer  Pjotr Prins  2021-01-01 12:27:00 +0000
commit     a86633121cd92e37b2353f4bba8f2c37fa970d05 (patch)
tree       5bd64dc5c71633fc842dc3092f7fabef24750ca4 /workflows/pull-data/genbank
parent     2c6181c7eb86c0285928a434a37401d6680f9f79 (diff)
genbank-fetch-ids.py
Diffstat (limited to 'workflows/pull-data/genbank')
-rw-r--r--  workflows/pull-data/genbank/README.md   2
-rw-r--r--  workflows/pull-data/genbank/utils.py    62
2 files changed, 63 insertions, 1 deletion
diff --git a/workflows/pull-data/genbank/README.md b/workflows/pull-data/genbank/README.md
index f442b5d..d7c294b 100644
--- a/workflows/pull-data/genbank/README.md
+++ b/workflows/pull-data/genbank/README.md
@@ -4,7 +4,7 @@
 # --- get list of IDs already in PubSeq
 sparql-fetch-ids > pubseq_ids.txt
 # --- get list of missing genbank IDs
-genbank-fetch-ids --skip pubseq_ids.txt > genbank_ids.txt
+genbank-fetch-ids.py --skip pubseq_ids.txt > genbank_ids.txt
 # --- fetch XML
 update-from-genbank.py --ids genbank_ids.txt --outdir ~/tmp/genbank
 # --- Transform to YAML and FASTA
diff --git a/workflows/pull-data/genbank/utils.py b/workflows/pull-data/genbank/utils.py
new file mode 100644
index 0000000..3efc67a
--- /dev/null
+++ b/workflows/pull-data/genbank/utils.py
@@ -0,0 +1,62 @@
+import os
+
+def is_integer(string_to_check):
+    try:
+        int(string_to_check)
+        return True
+    except ValueError:
+        return False
+
+def chunks(lst, n):
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
+    # Check duplicated entry looking at all dictionaries
+    field_to_term_to_uri_dict = {}
+
+    path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
+                              os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
+
+    for path_dict_xxx_csv in path_dict_xxx_csv_list:
+        print('Read {}'.format(path_dict_xxx_csv))
+
+        with open(path_dict_xxx_csv) as f:
+            for line in f:
+                if len(line.split(',')) > 2:
+                    term, uri = line.strip('\n').split('",')
+                else:
+                    term, uri = line.strip('\n').split(',')
+
+                term = term.strip('"')
+
+                if term in field_to_term_to_uri_dict:
+                    print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                    continue
+
+                field_to_term_to_uri_dict[term] = uri
+
+    # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen)
+    field_to_term_to_uri_dict = {}
+
+    for path_dict_xxx_csv in path_dict_xxx_csv_list:
+        field = os.path.basename(path_dict_xxx_csv).split('.')[0]
+
+        field_to_term_to_uri_dict[field] = {}
+
+        with open(path_dict_xxx_csv) as f:
+            for line in f:
+                if len(line.split(',')) > 2:
+                    term, uri = line.strip('\n').split('",')
+                else:
+                    term, uri = line.strip('\n').split(',')
+
+                term = term.strip('"')
+
+                if term in field_to_term_to_uri_dict[field]:
+                    print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                    continue
+
+                field_to_term_to_uri_dict[field][term] = uri
+
+    return field_to_term_to_uri_dict
\ No newline at end of file
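
The new utils.py provides three small helpers for the GenBank pull workflow: is_integer for validating numeric metadata fields, chunks for batching long ID lists, and check_and_get_ontology_dictionaries for loading term-to-URI mappings from one CSV per field. Below is a minimal usage sketch, not part of the commit: it assumes utils.py is on the Python import path, and the dictionaries directory, the host_species.csv file name and its single row are invented here purely for illustration.

# Usage sketch for the helpers added in workflows/pull-data/genbank/utils.py.
# Assumes utils.py is importable; the CSV directory and contents are hypothetical.
import os
import tempfile

from utils import is_integer, chunks, check_and_get_ontology_dictionaries

# is_integer: accept "42" but reject a GenBank accession such as "MT066156"
assert is_integer("42") and not is_integer("MT066156")

# chunks: split a long list of IDs into fixed-size batches for downstream fetching
ids = ["id{}".format(i) for i in range(10)]
assert [len(batch) for batch in chunks(ids, 4)] == [4, 4, 2]

# check_and_get_ontology_dictionaries: one CSV per field, each row "term,URI";
# the result is keyed first by field (file basename), then by term
with tempfile.TemporaryDirectory() as dir_dict:
    with open(os.path.join(dir_dict, "host_species.csv"), "w") as f:
        f.write('"Homo sapiens",http://purl.obolibrary.org/obo/NCBITaxon_9606\n')
    field_to_term_to_uri = check_and_get_ontology_dictionaries(dir_dict)
    print(field_to_term_to_uri["host_species"]["Homo sapiens"])

Keeping the mappings separated per field (the second pass in check_and_get_ontology_dictionaries) means a term that is valid for one field, say a species name, cannot silently resolve when it appears under another field such as specimen.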