about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Arun Isaac, 2025-09-01 16:37:10 +0100
committer: Arun Isaac, 2025-09-01 16:51:08 +0100
commit: e252fca465f34e7672d7fdd2ce32d7eeef3e59f7 (patch)
tree: d3b9f079c741471c4fe5b3e1c4863797eacdf3f5
parent: 452501245c623df38f462fa91afef1f4c0d798d6 (diff)
download: pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.gz
pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.lz
pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.zip
Pass dtype to read_csv.
read_csv can incorrectly infer that the string "00" is the integer 0.
To avoid this ambiguity, pass the correct dtype to read_csv.
-rw-r--r-- pyhegp/serialization.py 9
1 files changed, 6 insertions, 3 deletions
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index 799109e..c86d216 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -66,8 +66,9 @@ def write_summary(file, summary):
              float_format="%.8g",
              index=False))
 
-def read_tsv(file):
+def read_tsv(file, dtype):
     return pd.read_csv(file,
+                       dtype=dtype,
                        quoting=csv.QUOTE_NONE,
                        sep="\t",
                        na_filter=False,
@@ -78,7 +79,9 @@ def read_tsv(file):
                        skip_blank_lines=False)
 
 def read_genotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"chromosome": "str",
+                         "position": "int",
+                         "reference": "str"})
     sample_columns = [column
                       for column in df.columns
                       if column not in ["chromosome", "position", "reference"]]
@@ -90,7 +93,7 @@ def read_genotype(file):
     return df
 
 def read_phenotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"sample-id": "str"})
     phenotype_columns = [column
                          for column in df.columns
                          if column != "sample-id"]