Add phenotype file format and serialization functions.

author: Arun Isaac 2025-09-01 13:38:01 +0100
committer: Arun Isaac 2025-09-01 15:04:30 +0100
commit: 48cd79a75f274a2bdffbefc508e2a0e4be96f34d (patch)
tree: 4f81f15a4d7fd4d76eaab4b8383bc656e57945a0
parent: b2826e3bf5809b80dfb99286cf806361e7bfcbae (diff)
download: pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.tar.gz pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.tar.lz pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.zip
3 files changed, 71 insertions, 12 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md
index 52ccd0a..8515b38 100644
--- a/doc/file-formats.md
+++ b/doc/file-formats.md
@@ -47,6 +47,27 @@ chr11	3462348	T	-0.361	-0.3244	-0.326	0.0986
 chr11	3464016	A	-0.3461	-0.334	-0.3331	0.08
 ```
 
+## phenotype (and covariates) file
+
+The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. The column labelled `sample-id` contains the sample identifier for that individual. Every other column contains one phenotypic trait for that individual, and its header MUST be the name of that phenotype. Trait values MUST be numeric. Column headers are case-sensitive.
+
+Here is an example phenotype file.
+```
+sample-id	sex	start-weight	end-weight	weight-growth-slope	glucose-weight
+A063361614	1	-0.07717	0.04008	0.09186	0.0006918
+A084271874	1	-0.09059	-0.1205	-0.02113	-0.1259
+A048102344	1	-0.08586	-0.04292	0.03724	-0.05113
+A048120064	1	-0.09509	-0.04183	-0.0003283	-0.05509
+A083583043	1	0.06565	0.02807	-0.02835	0.02076
+A063014029	1	-0.1015	-0.113	-0.02167	-0.1116
+A063001812	1	0.1294	0.1109	0.0217	0.1099
+A063873017	1	0.1711	0.1519	0.0116	0.1564
+A066790098	1	0.04236	0.2847	0.1172	0.2834
+A053042270	1	0.1332	0.1758	0.05178	0.2161
+```
+
+Phenotype files may also be used to store covariates.
+
 ## key file
 
 The key file is a tab-separated values (TSV) file with numerical data. There MUST be no column headers.
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index a17c966..b234a5b 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -66,11 +66,14 @@ def write_summary(file, summary):
              float_format="%.8g",
              index=False))
 
+def read_tsv(file):
+    return pd.read_csv(file,
+                       quoting=csv.QUOTE_NONE,
+                       sep="\t",
+                       na_filter=False)
+
 def read_genotype(file):
-    df = pd.read_csv(file,
-                     quoting=csv.QUOTE_NONE,
-                     sep="\t",
-                     na_filter=False)
+    df = read_tsv(file)
     sample_columns = [column
                       for column in df.columns
                       if column not in ["chromosome", "position", "reference"]]
@@ -81,13 +84,24 @@ def read_genotype(file):
     df[sample_columns] = df[sample_columns].astype("float")
     return df
 
-def write_genotype(file, genotype):
-    (genotype
-     .to_csv(file,
-             quoting=csv.QUOTE_NONE,
-             sep="\t",
-             float_format="%.8g",
-             index=False))
+def read_phenotype(file):
+    df = read_tsv(file)
+    phenotype_columns = [column
+                         for column in df.columns
+                         if column != "sample-id"]
+    df["sample-id"] = df["sample-id"].astype("str")
+    df[phenotype_columns] = df[phenotype_columns].astype("float")
+    return df
+
+def write_tsv(file, df):
+    df.to_csv(file,
+              quoting=csv.QUOTE_NONE,
+              sep="\t",
+              float_format="%.8g",
+              index=False)
+
+write_genotype = write_tsv
+write_phenotype = write_tsv
 
 def read_key(file):
     return np.loadtxt(file, delimiter="\t", ndmin=2)
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index a473796..c856094 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -24,7 +24,7 @@ from hypothesis.extra.pandas import column, columns, data_frames
 import pandas as pd
 from pytest import approx
 
-from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype, read_key, write_key
+from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype, read_phenotype, write_phenotype, read_key, write_key
 from pyhegp.utils import negate
 
 tabless_printable_ascii_text = st.text(
@@ -118,6 +118,30 @@ def test_read_write_genotype_are_inverses(genotype):
         file.seek(0)
         pd.testing.assert_frame_equal(genotype, read_genotype(file))
 
+def phenotype_reserved_column_name_p(name):
+    return name.lower() == "sample-id"
+
+phenotype_names = st.lists(tabless_printable_ascii_text
+                           .filter(negate(phenotype_reserved_column_name_p)),
+                           unique=True)
+
+@st.composite
+def phenotype_frames(draw):
+    return draw(data_frames(
+        columns=([column(name="sample-id",
+                         dtype="str",
+                         elements=tabless_printable_ascii_text)]
+                 + columns(draw(phenotype_names),
+                           dtype="float64",
+                           elements=st.floats(allow_nan=False)))))
+
+@given(phenotype_frames())
+def test_read_write_phenotype_are_inverses(phenotype):
+    with tempfile.TemporaryFile() as file:
+        write_phenotype(file, phenotype)
+        file.seek(0)
+        pd.testing.assert_frame_equal(phenotype, read_phenotype(file))
+
 @given(arrays("float64",
               array_shapes(min_dims=2, max_dims=2)))
 def test_read_write_key_are_inverses(key):
author	Arun Isaac	2025-09-01 13:38:01 +0100
committer	Arun Isaac	2025-09-01 15:04:30 +0100
commit	48cd79a75f274a2bdffbefc508e2a0e4be96f34d (patch)
tree	4f81f15a4d7fd4d76eaab4b8383bc656e57945a0
parent	b2826e3bf5809b80dfb99286cf806361e7bfcbae (diff)
download	pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.tar.gz pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.tar.lz pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.zip