From e252fca465f34e7672d7fdd2ce32d7eeef3e59f7 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Mon, 1 Sep 2025 16:37:10 +0100
Subject: Pass dtype to read_csv.

read_csv can incorrectly infer that the string "00" is the integer 0.
To avoid this ambiguity, pass the correct dtype to read_csv.
---
 pyhegp/serialization.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index 799109e..c86d216 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -66,8 +66,9 @@ def write_summary(file, summary):
                    float_format="%.8g",
                    index=False))
 
-def read_tsv(file):
+def read_tsv(file, dtype):
     return pd.read_csv(file,
+                       dtype=dtype,
                        quoting=csv.QUOTE_NONE,
                        sep="\t",
                        na_filter=False,
@@ -78,7 +79,9 @@ def read_tsv(file):
                        skip_blank_lines=False)
 
 def read_genotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"chromosome": "str",
+                         "position": "int",
+                         "reference": "str"})
     sample_columns = [column
                       for column in df.columns
                       if column not in ["chromosome", "position", "reference"]]
@@ -90,7 +93,7 @@ def read_genotype(file):
     return df
 
 def read_phenotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"sample-id": "str"})
     phenotype_columns = [column
                          for column in df.columns
                          if column != "sample-id"]
-- 
cgit 1.4.1