Pass dtype to read_csv.

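When no dtype is given, read_csv infers each column's type from the data and can incorrectly read a string such as "00" as the integer 0, losing the leading zero. To avoid this ambiguity, pass the correct dtype for each known column to read_csv.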
author: Arun Isaac 2025-09-01 16:37:10 +0100
committer: Arun Isaac 2025-09-01 16:51:08 +0100
commit: e252fca465f34e7672d7fdd2ce32d7eeef3e59f7 (patch)
tree: d3b9f079c741471c4fe5b3e1c4863797eacdf3f5
parent: 452501245c623df38f462fa91afef1f4c0d798d6 (diff)
download: pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.gz
pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.lz
pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.zip
1 file changed, 6 insertions, 3 deletions
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index 799109e..c86d216 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -66,8 +66,9 @@ def write_summary(file, summary):
              float_format="%.8g",
              index=False))
 
-def read_tsv(file):
+def read_tsv(file, dtype):
     return pd.read_csv(file,
+                       dtype=dtype,
                        quoting=csv.QUOTE_NONE,
                        sep="\t",
                        na_filter=False,
@@ -78,7 +79,9 @@ def read_tsv(file):
                        skip_blank_lines=False)
 
 def read_genotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"chromosome": "str",
+                         "position": "int",
+                         "reference": "str"})
     sample_columns = [column
                       for column in df.columns
                       if column not in ["chromosome", "position", "reference"]]
@@ -90,7 +93,7 @@ def read_genotype(file):
     return df
 
 def read_phenotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"sample-id": "str"})
     phenotype_columns = [column
                          for column in df.columns
                          if column != "sample-id"]
author	Arun Isaac	2025-09-01 16:37:10 +0100
committer	Arun Isaac	2025-09-01 16:51:08 +0100
commit	e252fca465f34e7672d7fdd2ce32d7eeef3e59f7 (patch)
tree	d3b9f079c741471c4fe5b3e1c4863797eacdf3f5
parent	452501245c623df38f462fa91afef1f4c0d798d6 (diff)
download	pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.gz pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.lz pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.zip