From a76dedb432aa99f53681144c488ac19b9c23e660 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Wed, 14 Jan 2026 20:29:33 +0000 Subject: Add intercept column in phenotypes file. --- doc/file-formats.md | 6 +++++- pyhegp/pyhegp.py | 11 +++++++---- tests/helpers/strategies.py | 12 ++++++++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/file-formats.md b/doc/file-formats.md index 01263e3..04fae0c 100644 --- a/doc/file-formats.md +++ b/doc/file-formats.md @@ -49,7 +49,11 @@ chr11 3464016 A -0.3461 -0.334 -0.3331 0.08 ## phenotype (and covariates) file -The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. +The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. + +The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. + +Encryption MAY add a column labelled `intercept` containing the ciphertext corresponding to a vector of 1s. Here is an example phenotype file. ``` diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py index 3694ce4..2a82690 100644 --- a/pyhegp/pyhegp.py +++ b/pyhegp/pyhegp.py @@ -131,11 +131,14 @@ def encrypt_genotype(genotype, key, summary): def encrypt_phenotype(phenotype, key): phenotype_matrix = phenotype.drop(columns=["sample-id"]) - sample_names = phenotype_matrix.columns + sample_names = list(phenotype_matrix.columns) return pd.concat((phenotype["sample-id"], - pd.DataFrame(hegp_encrypt(phenotype_matrix.to_numpy(), - key), - columns=sample_names)), + pd.DataFrame( + hegp_encrypt( + np.column_stack((np.ones(len(phenotype)), + phenotype_matrix.to_numpy())), + key), + columns=["intercept"] + sample_names)), axis="columns") def cat_genotype(genotypes): diff --git a/tests/helpers/strategies.py b/tests/helpers/strategies.py index 3771ed0..171c721 100644 --- a/tests/helpers/strategies.py +++ b/tests/helpers/strategies.py @@ -1,5 +1,5 @@ ### pyhegp --- Homomorphic encryption of genotypes and phenotypes -### Copyright © 2025 Arun Isaac +### Copyright © 2025–2026 Arun Isaac ### ### This file is part of pyhegp. ### @@ -104,13 +104,21 @@ phenotype_names = st.lists(tabless_printable_ascii_text @st.composite def phenotype_frames(draw, number_of_samples=st.integers(min_value=0, - max_value=10)): + max_value=10), + intercept_present=st.booleans()): _number_of_samples = draw(number_of_samples) return draw(data_frames( columns=([column(name="sample-id", dtype="str", elements=tabless_printable_ascii_text, unique=True)] + + ([column(name="intercept", + dtype="float64", + elements=st.floats(min_value=-1, + max_value=1, + allow_nan=False))] + if draw(intercept_present) + else []) + columns(draw(phenotype_names), dtype="float64", elements=st.floats(min_value=-1000, -- cgit 1.4.1