diff options
| author | Arun Isaac | 2026-01-14 20:29:33 +0000 |
|---|---|---|
| committer | Arun Isaac | 2026-01-16 23:06:32 +0000 |
| commit | a76dedb432aa99f53681144c488ac19b9c23e660 (patch) | |
| tree | 17ba9a6502c0f71a4355f0b8210666b65749ea15 | |
| parent | 1e6efe6ef73cbb65839a95883899442f5dc4afe1 (diff) | |
| download | pyhegp-a76dedb432aa99f53681144c488ac19b9c23e660.tar.gz pyhegp-a76dedb432aa99f53681144c488ac19b9c23e660.tar.lz pyhegp-a76dedb432aa99f53681144c488ac19b9c23e660.zip | |
Add intercept column in phenotypes file.
| -rw-r--r-- | doc/file-formats.md | 6 | ||||
| -rw-r--r-- | pyhegp/pyhegp.py | 11 | ||||
| -rw-r--r-- | tests/helpers/strategies.py | 12 |
3 files changed, 22 insertions, 7 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md index 01263e3..04fae0c 100644 --- a/doc/file-formats.md +++ b/doc/file-formats.md @@ -49,7 +49,11 @@ chr11 3464016 A -0.3461 -0.334 -0.3331 0.08 ## phenotype (and covariates) file -The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. +The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. + +The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. + +Encryption MAY add a column labelled `intercept` containing the ciphertext corresponding to a vector of 1s. Here is an example phenotype file. ``` diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py index 3694ce4..2a82690 100644 --- a/pyhegp/pyhegp.py +++ b/pyhegp/pyhegp.py @@ -131,11 +131,14 @@ def encrypt_genotype(genotype, key, summary): def encrypt_phenotype(phenotype, key): phenotype_matrix = phenotype.drop(columns=["sample-id"]) - sample_names = phenotype_matrix.columns + sample_names = list(phenotype_matrix.columns) return pd.concat((phenotype["sample-id"], - pd.DataFrame(hegp_encrypt(phenotype_matrix.to_numpy(), - key), - columns=sample_names)), + pd.DataFrame( + hegp_encrypt( + np.column_stack((np.ones(len(phenotype)), + phenotype_matrix.to_numpy())), + key), + columns=["intercept"] + sample_names)), axis="columns") def cat_genotype(genotypes): diff --git a/tests/helpers/strategies.py b/tests/helpers/strategies.py index 3771ed0..171c721 100644 --- a/tests/helpers/strategies.py +++ b/tests/helpers/strategies.py @@ -1,5 +1,5 @@ ### pyhegp --- Homomorphic encryption of genotypes and phenotypes -### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net> +### Copyright © 2025–2026 Arun Isaac <arunisaac@systemreboot.net> ### ### This file is part of pyhegp. ### @@ -104,13 +104,21 @@ phenotype_names = st.lists(tabless_printable_ascii_text @st.composite def phenotype_frames(draw, number_of_samples=st.integers(min_value=0, - max_value=10)): + max_value=10), + intercept_present=st.booleans()): _number_of_samples = draw(number_of_samples) return draw(data_frames( columns=([column(name="sample-id", dtype="str", elements=tabless_printable_ascii_text, unique=True)] + + ([column(name="intercept", + dtype="float64", + elements=st.floats(min_value=-1, + max_value=1, + allow_nan=False))] + if draw(intercept_present) + else []) + columns(draw(phenotype_names), dtype="float64", elements=st.floats(min_value=-1000, |
