### pyhegp --- Homomorphic encryption of genotypes and phenotypes ### Copyright © 2025 Arun Isaac ### ### This file is part of pyhegp. ### ### pyhegp is free software: you can redistribute it and/or modify it ### under the terms of the GNU General Public License as published by ### the Free Software Foundation, either version 3 of the License, or ### (at your option) any later version. ### ### pyhegp is distributed in the hope that it will be useful, but ### WITHOUT ANY WARRANTY; without even the implied warranty of ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ### General Public License for more details. ### ### You should have received a copy of the GNU General Public License ### along with pyhegp. If not, see . from hypothesis import strategies as st from hypothesis.extra.pandas import column, columns, data_frames, range_indexes import pandas as pd from scipy.stats import special_ortho_group from pyhegp.serialization import Summary, is_genotype_metadata_column, is_phenotype_metadata_column from pyhegp.utils import negate tabless_printable_ascii_text = st.text( # Exclude control characters and tab. st.characters(codec="ascii", exclude_categories=("Cc",), exclude_characters=("\t",)), min_size=1) chromosome_column = column(name="chromosome", dtype="str", elements=tabless_printable_ascii_text) position_column = column(name="position", dtype="int") reference_column = column(name="reference", dtype="str", elements=st.text( st.characters(codec="ascii", categories=(), include_characters=("A", "G", "C", "T")), min_size=1)) sample_names = (tabless_printable_ascii_text .filter(negate(is_genotype_metadata_column))) def genotype_metadata(draw, number_of_snps, reference_present): return draw(data_frames( columns=([chromosome_column, position_column] + ([reference_column] if reference_present else [])), index=range_indexes(min_size=number_of_snps, max_size=number_of_snps))) @st.composite def summaries(draw): stats = draw(data_frames( columns=columns(["mean", "std"], dtype="float64", elements=st.floats(allow_nan=False)))) return Summary(draw(st.integers()), pd.concat((genotype_metadata(draw, len(stats), draw(st.booleans())), stats), axis="columns")) @st.composite def genotype_frames(draw, number_of_samples=st.integers(min_value=0, max_value=10), reference_present=st.booleans()): _number_of_samples = draw(number_of_samples) dosages = draw(data_frames( columns=columns(draw(st.lists(sample_names, min_size=_number_of_samples, max_size=_number_of_samples, unique=True)), dtype="float64", elements=st.floats(min_value=0, max_value=100, allow_nan=False)))) genotype = pd.concat((genotype_metadata(draw, len(dosages), draw(reference_present)), dosages), axis="columns") return genotype.drop_duplicates(subset=list( filter(is_genotype_metadata_column, genotype.columns)), ignore_index=True) phenotype_names = st.lists(tabless_printable_ascii_text .filter(negate(is_phenotype_metadata_column)), unique=True) @st.composite def phenotype_frames(draw, number_of_samples=st.integers(min_value=0, max_value=10)): _number_of_samples = draw(number_of_samples) return draw(data_frames( columns=([column(name="sample-id", dtype="str", elements=tabless_printable_ascii_text, unique=True)] + columns(draw(phenotype_names), dtype="float64", elements=st.floats(min_value=-1000, max_value=1000, allow_nan=False))), index=range_indexes(min_size=_number_of_samples, max_size=_number_of_samples))) @st.composite def keys(draw, size): return (special_ortho_group(draw(size), seed=draw(st.integers(min_value=0, max_value=2**32-1))) .rvs())