genbank and sra scripts more picky on the ontologies; added utils.py for shared functions

author: AndreaGuarracino 2020-09-28 09:13:50 +0200
committer: AndreaGuarracino 2020-09-28 09:13:50 +0200
commit: bc2e51bc8418876cc826482ece10874b2a61fa03 (patch)
tree: f9d7fc6139d3928f6a8985c7dc1b2a2c579ea586 /scripts/utils.py
parent: e64c1084caf9a8fe0ebca1b01d9353533f65f2ee (diff)
download: bh20-seq-resource-bc2e51bc8418876cc826482ece10874b2a61fa03.tar.gz
bh20-seq-resource-bc2e51bc8418876cc826482ece10874b2a61fa03.tar.lz
bh20-seq-resource-bc2e51bc8418876cc826482ece10874b2a61fa03.zip
1 files changed, 62 insertions, 0 deletions
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..3efc67a
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,62 @@
+import csv
+import os
+
+def is_integer(string_to_check):
+    """Tell whether ``int()`` accepts the given value.
+
+    Returns True when ``int(string_to_check)`` succeeds and False when it
+    raises ValueError. Any other exception raised by ``int()`` is not caught
+    here and propagates to the caller.
+    """
+    try:
+        int(string_to_check)
+    except ValueError:
+        return False
+    else:
+        return True
+
+def chunks(lst, n):
+    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.
+
+    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
+    """
+    yield from (lst[start:start + n] for start in range(0, len(lst), n))
+
+def _read_term_uri_rows(path_dict_xxx_csv):
+    """Parse one ontology dictionary CSV into a list of (term, uri) pairs.
+
+    Every non-blank row must hold exactly two fields: the term and its URI.
+    A term that contains commas has to be double-quoted in the file; the csv
+    module removes that quoting. Blank lines are skipped.
+
+    Raises:
+        ValueError: if a non-blank row does not have exactly two fields.
+    """
+    term_uri_list = []
+
+    # newline='' is what the csv module documentation asks for, so that the
+    # reader does its own line-ending handling.
+    with open(path_dict_xxx_csv, newline='') as f:
+        reader = csv.reader(f)
+
+        for row in reader:
+            if not row:
+                # Blank line (typically a trailing one): nothing to record.
+                continue
+
+            if len(row) != 2:
+                raise ValueError('{}, line {}: expected 2 fields (term,uri) but found {}: {}'.format(
+                    path_dict_xxx_csv, reader.line_num, len(row), row))
+
+            term, uri = row
+            term_uri_list.append((term, uri))
+
+    return term_uri_list
+
+
+def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
+    """Load every ``*.csv`` ontology dictionary found in a directory.
+
+    Each file maps terms to URIs, one ``term,uri`` pair per line. The part of
+    the file name before the first dot is used as the field name.
+
+    Two kinds of duplicates are reported on stdout (they are warnings only):
+    a term that appears more than once across all the dictionaries, and a
+    term that appears more than once inside a single dictionary. In the
+    latter case the first URI found for the term is the one that is kept.
+
+    Args:
+        dir_ontology_dictionaries: directory containing the CSV dictionaries.
+            Only entries whose name ends with '.csv' are read.
+
+    Returns:
+        A dict mapping each field name to its own {term: uri} dict.
+
+    Raises:
+        ValueError: if a non-blank row of a dictionary does not have exactly
+            two fields.
+    """
+    # Sorted so that the order of the messages and of the returned fields does
+    # not depend on the arbitrary order given by os.listdir.
+    path_dict_xxx_csv_list = sorted(
+        os.path.join(dir_ontology_dictionaries, name_xxx_csv)
+        for name_xxx_csv in os.listdir(dir_ontology_dictionaries)
+        if name_xxx_csv.endswith('.csv')
+    )
+
+    # Check duplicated entries looking at all dictionaries together. Each file
+    # is parsed only once, here; the rows are kept for the second step.
+    seen_term_set = set()
+    path_and_term_uri_list = []
+
+    for path_dict_xxx_csv in path_dict_xxx_csv_list:
+        print('Read {}'.format(path_dict_xxx_csv))
+
+        term_uri_list = _read_term_uri_rows(path_dict_xxx_csv)
+        path_and_term_uri_list.append((path_dict_xxx_csv, term_uri_list))
+
+        for term, _ in term_uri_list:
+            if term in seen_term_set:
+                print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
+                continue
+
+            seen_term_set.add(term)
+
+    # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen)
+    field_to_term_to_uri_dict = {}
+
+    for path_dict_xxx_csv, term_uri_list in path_and_term_uri_list:
+        field = os.path.basename(path_dict_xxx_csv).split('.')[0]
+
+        term_to_uri_dict = {}
+
+        for term, uri in term_uri_list:
+            if term in term_to_uri_dict:
+                print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
+                continue
+
+            term_to_uri_dict[term] = uri
+
+        field_to_term_to_uri_dict[field] = term_to_uri_dict
+
+    return field_to_term_to_uri_dict
\ No newline at end of file
author	AndreaGuarracino	2020-09-28 09:13:50 +0200
committer	AndreaGuarracino	2020-09-28 09:13:50 +0200
commit	bc2e51bc8418876cc826482ece10874b2a61fa03 (patch)
tree	f9d7fc6139d3928f6a8985c7dc1b2a2c579ea586 /scripts/utils.py
parent	e64c1084caf9a8fe0ebca1b01d9353533f65f2ee (diff)
download	bh20-seq-resource-bc2e51bc8418876cc826482ece10874b2a61fa03.tar.gz bh20-seq-resource-bc2e51bc8418876cc826482ece10874b2a61fa03.tar.lz bh20-seq-resource-bc2e51bc8418876cc826482ece10874b2a61fa03.zip