From e252fca465f34e7672d7fdd2ce32d7eeef3e59f7 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Mon, 1 Sep 2025 16:37:10 +0100
Subject: Pass dtype to read_csv.

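pandas' read_csv infers column types from the data, so an identifier
such as the string "00" (e.g. a chromosome name or sample ID) can be
misread as the integer 0, losing its leading zero. To avoid this
ambiguity, pass explicit dtypes for the known columns to read_csv.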
---
 pyhegp/serialization.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index 799109e..c86d216 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -66,8 +66,9 @@ def write_summary(file, summary):
              float_format="%.8g",
              index=False))
 
-def read_tsv(file):
+def read_tsv(file, dtype):
     return pd.read_csv(file,
+                       dtype=dtype,
                        quoting=csv.QUOTE_NONE,
                        sep="\t",
                        na_filter=False,
@@ -78,7 +79,9 @@ def read_tsv(file):
                        skip_blank_lines=False)
 
 def read_genotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"chromosome": "str",
+                         "position": "int",
+                         "reference": "str"})
     sample_columns = [column
                       for column in df.columns
                       if column not in ["chromosome", "position", "reference"]]
@@ -90,7 +93,7 @@ def read_genotype(file):
     return df
 
 def read_phenotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"sample-id": "str"})
     phenotype_columns = [column
                          for column in df.columns
                          if column != "sample-id"]
-- 
cgit 1.4.1