diff options
author | Arun Isaac | 2025-09-01 13:38:01 +0100 |
---|---|---|
committer | Arun Isaac | 2025-09-01 15:04:30 +0100 |
commit | 48cd79a75f274a2bdffbefc508e2a0e4be96f34d (patch) | |
tree | 4f81f15a4d7fd4d76eaab4b8383bc656e57945a0 | |
parent | b2826e3bf5809b80dfb99286cf806361e7bfcbae (diff) | |
download | pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.tar.gz pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.tar.lz pyhegp-48cd79a75f274a2bdffbefc508e2a0e4be96f34d.zip |
Add phenotype file format and serialization functions.
-rw-r--r-- | doc/file-formats.md | 21 | ||||
-rw-r--r-- | pyhegp/serialization.py | 36 | ||||
-rw-r--r-- | tests/test_serialization.py | 26 |
3 files changed, 71 insertions, 12 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md index 52ccd0a..8515b38 100644 --- a/doc/file-formats.md +++ b/doc/file-formats.md @@ -47,6 +47,27 @@ chr11 3462348 T -0.361 -0.3244 -0.326 0.0986 chr11 3464016 A -0.3461 -0.334 -0.3331 0.08 ``` +## phenotype (and covariates) file + +The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. + +Here is an example phenotype file. +``` +sample-id sex start-weight end-weight weight-growth-slope glucose-weight +A063361614 1 -0.07717 0.04008 0.09186 0.0006918 +A084271874 1 -0.09059 -0.1205 -0.02113 -0.1259 +A048102344 1 -0.08586 -0.04292 0.03724 -0.05113 +A048120064 1 -0.09509 -0.04183 -0.0003283 -0.05509 +A083583043 1 0.06565 0.02807 -0.02835 0.02076 +A063014029 1 -0.1015 -0.113 -0.02167 -0.1116 +A063001812 1 0.1294 0.1109 0.0217 0.1099 +A063873017 1 0.1711 0.1519 0.0116 0.1564 +A066790098 1 0.04236 0.2847 0.1172 0.2834 +A053042270 1 0.1332 0.1758 0.05178 0.2161 +``` + +Phenotype files may also be used to store covariates. + ## key file The key file is a tab-separated values (TSV) file with numerical data. There MUST be no column headers. diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py index a17c966..b234a5b 100644 --- a/pyhegp/serialization.py +++ b/pyhegp/serialization.py @@ -66,11 +66,14 @@ def write_summary(file, summary): float_format="%.8g", index=False)) +def read_tsv(file): + return pd.read_csv(file, + quoting=csv.QUOTE_NONE, + sep="\t", + na_filter=False) + def read_genotype(file): - df = pd.read_csv(file, - quoting=csv.QUOTE_NONE, - sep="\t", - na_filter=False) + df = read_tsv(file) sample_columns = [column for column in df.columns if column not in ["chromosome", "position", "reference"]] @@ -81,13 +84,24 @@ def read_genotype(file): df[sample_columns] = df[sample_columns].astype("float") return df -def write_genotype(file, genotype): - (genotype - .to_csv(file, - quoting=csv.QUOTE_NONE, - sep="\t", - float_format="%.8g", - index=False)) +def read_phenotype(file): + df = read_tsv(file) + phenotype_columns = [column + for column in df.columns + if column != "sample-id"] + df["sample-id"] = df["sample-id"].astype("str") + df[phenotype_columns] = df[phenotype_columns].astype("float") + return df + +def write_tsv(file, df): + df.to_csv(file, + quoting=csv.QUOTE_NONE, + sep="\t", + float_format="%.8g", + index=False) + +write_genotype = write_tsv +write_phenotype = write_tsv def read_key(file): return np.loadtxt(file, delimiter="\t", ndmin=2) diff --git a/tests/test_serialization.py b/tests/test_serialization.py index a473796..c856094 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -24,7 +24,7 @@ from hypothesis.extra.pandas import column, columns, data_frames import pandas as pd from pytest import approx -from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype, read_key, write_key +from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype, read_phenotype, write_phenotype, read_key, write_key from pyhegp.utils import negate tabless_printable_ascii_text = st.text( @@ -118,6 +118,30 @@ def test_read_write_genotype_are_inverses(genotype): file.seek(0) pd.testing.assert_frame_equal(genotype, read_genotype(file)) +def phenotype_reserved_column_name_p(name): + return name.lower() == "sample-id" + +phenotype_names = st.lists(tabless_printable_ascii_text + .filter(negate(phenotype_reserved_column_name_p)), + unique=True) + +@st.composite +def phenotype_frames(draw): + return draw(data_frames( + columns=([column(name="sample-id", + dtype="str", + elements=tabless_printable_ascii_text)] + + columns(draw(phenotype_names), + dtype="float64", + elements=st.floats(allow_nan=False))))) + +@given(phenotype_frames()) +def test_read_write_phenotype_are_inverses(phenotype): + with tempfile.TemporaryFile() as file: + write_phenotype(file, phenotype) + file.seek(0) + pd.testing.assert_frame_equal(phenotype, read_phenotype(file)) + @given(arrays("float64", array_shapes(min_dims=2, max_dims=2))) def test_read_write_key_are_inverses(key): |