diff options
| -rw-r--r-- | doc/file-formats.md | 6 | ||||
| -rw-r--r-- | pyhegp/pyhegp.py | 11 | ||||
| -rw-r--r-- | tests/helpers/strategies.py | 12 |
3 files changed, 22 insertions, 7 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md index 01263e3..04fae0c 100644 --- a/doc/file-formats.md +++ b/doc/file-formats.md @@ -49,7 +49,11 @@ chr11 3464016 A -0.3461 -0.334 -0.3331 0.08 ## phenotype (and covariates) file -The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. +The phenotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one individual. + +The column labelled `sample-id` contains the sample identifier for that individual. Other columns each contain a phenotypical trait for that individual. The headers of these columns MUST be the names of the phenotypes. Column headers are case-sensitive. + +Encryption MAY add a column labelled `intercept` containing the ciphertext corresponding to a vector of 1s. Here is an example phenotype file. ``` diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py index 3694ce4..2a82690 100644 --- a/pyhegp/pyhegp.py +++ b/pyhegp/pyhegp.py @@ -131,11 +131,14 @@ def encrypt_genotype(genotype, key, summary): def encrypt_phenotype(phenotype, key): phenotype_matrix = phenotype.drop(columns=["sample-id"]) - sample_names = phenotype_matrix.columns + sample_names = list(phenotype_matrix.columns) return pd.concat((phenotype["sample-id"], - pd.DataFrame(hegp_encrypt(phenotype_matrix.to_numpy(), - key), - columns=sample_names)), + pd.DataFrame( + hegp_encrypt( + np.column_stack((np.ones(len(phenotype)), + phenotype_matrix.to_numpy())), + key), + columns=["intercept"] + sample_names)), axis="columns") def cat_genotype(genotypes): diff --git a/tests/helpers/strategies.py b/tests/helpers/strategies.py index 3771ed0..171c721 100644 --- a/tests/helpers/strategies.py +++ b/tests/helpers/strategies.py @@ -1,5 +1,5 @@ ### pyhegp --- Homomorphic encryption of genotypes and phenotypes -### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net> +### Copyright © 2025–2026 Arun Isaac <arunisaac@systemreboot.net> ### ### This file is part of pyhegp. ### @@ -104,13 +104,21 @@ phenotype_names = st.lists(tabless_printable_ascii_text @st.composite def phenotype_frames(draw, number_of_samples=st.integers(min_value=0, - max_value=10)): + max_value=10), + intercept_present=st.booleans()): _number_of_samples = draw(number_of_samples) return draw(data_frames( columns=([column(name="sample-id", dtype="str", elements=tabless_printable_ascii_text, unique=True)] + + ([column(name="intercept", + dtype="float64", + elements=st.floats(min_value=-1, + max_value=1, + allow_nan=False))] + if draw(intercept_present) + else []) + columns(draw(phenotype_names), dtype="float64", elements=st.floats(min_value=-1000, |
