diff options
| author | Arun Isaac | 2025-09-01 16:37:10 +0100 | 
|---|---|---|
| committer | Arun Isaac | 2025-09-01 16:51:08 +0100 | 
| commit | e252fca465f34e7672d7fdd2ce32d7eeef3e59f7 (patch) | |
| tree | d3b9f079c741471c4fe5b3e1c4863797eacdf3f5 | |
| parent | 452501245c623df38f462fa91afef1f4c0d798d6 (diff) | |
| download | pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.gz pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.tar.lz pyhegp-e252fca465f34e7672d7fdd2ce32d7eeef3e59f7.zip | |
Pass dtype to read_csv.
read_csv can incorrectly infer that the string "00" is the integer 0. To avoid this ambiguity, pass the correct dtype to read_csv.
| -rw-r--r-- | pyhegp/serialization.py | 9 | 
1 file changed, 6 insertions, 3 deletions
```diff
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index 799109e..c86d216 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -66,8 +66,9 @@ def write_summary(file, summary):
                       float_format="%.8g",
                       index=False))
 
-def read_tsv(file):
+def read_tsv(file, dtype):
     return pd.read_csv(file,
+                       dtype=dtype,
                        quoting=csv.QUOTE_NONE,
                        sep="\t",
                        na_filter=False,
@@ -78,7 +79,9 @@ def read_tsv(file):
                        skip_blank_lines=False)
 
 def read_genotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"chromosome": "str",
+                         "position": "int",
+                         "reference": "str"})
     sample_columns = [column
                       for column in df.columns
                       if column not in ["chromosome", "position", "reference"]]
@@ -90,7 +93,7 @@ def read_genotype(file):
     return df
 
 def read_phenotype(file):
-    df = read_tsv(file)
+    df = read_tsv(file, {"sample-id": "str"})
     phenotype_columns = [column
                          for column in df.columns
                          if column != "sample-id"]
```
