### pyhegp --- Homomorphic encryption of genotypes and phenotypes ### Copyright © 2025 Arun Isaac ### ### This file is part of pyhegp. ### ### pyhegp is free software: you can redistribute it and/or modify it ### under the terms of the GNU General Public License as published by ### the Free Software Foundation, either version 3 of the License, or ### (at your option) any later version. ### ### pyhegp is distributed in the hope that it will be useful, but ### WITHOUT ANY WARRANTY; without even the implied warranty of ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ### General Public License for more details. ### ### You should have received a copy of the GNU General Public License ### along with pyhegp. If not, see . from hypothesis import strategies as st from hypothesis.extra.pandas import column, columns, data_frames from scipy.stats import special_ortho_group from pyhegp.serialization import Summary, is_genotype_metadata_column, is_phenotype_metadata_column from pyhegp.utils import negate tabless_printable_ascii_text = st.text( # Exclude control characters and tab. st.characters(codec="ascii", exclude_categories=("Cc",), exclude_characters=("\t",)), min_size=1) chromosome_column = column(name="chromosome", dtype="str", elements=tabless_printable_ascii_text) position_column = column(name="position", dtype="int") reference_column = column(name="reference", dtype="str", elements=st.text( st.characters(codec="ascii", categories=(), include_characters=("A", "G", "C", "T")), min_size=1)) sample_names = (tabless_printable_ascii_text .filter(negate(is_genotype_metadata_column))) @st.composite def summaries(draw): return Summary(draw(st.integers()), draw(data_frames( columns=([chromosome_column, position_column] + ([reference_column] if draw(st.booleans()) else []) + columns(["mean", "std"], dtype="float64", elements=st.floats(allow_nan=False)))))) @st.composite def genotype_frames(draw, number_of_samples=st.integers(min_value=0, max_value=10), reference_present=st.booleans()): _number_of_samples = draw(number_of_samples) genotype = draw(data_frames( columns=([chromosome_column, position_column] + ([reference_column] if draw(reference_present) else []) + columns(draw(st.lists(sample_names, min_size=_number_of_samples, max_size=_number_of_samples, unique=True)), dtype="float64", elements=st.floats(allow_nan=False))))) return genotype.drop_duplicates(subset=list( filter(is_genotype_metadata_column, genotype.columns)), ignore_index=True) phenotype_names = st.lists(tabless_printable_ascii_text .filter(negate(is_phenotype_metadata_column)), unique=True) @st.composite def phenotype_frames(draw): phenotype = draw(data_frames( columns=([column(name="sample-id", dtype="str", elements=tabless_printable_ascii_text)] + columns(draw(phenotype_names), dtype="float64", elements=st.floats(allow_nan=False))))) return phenotype.drop_duplicates(subset=list( filter(is_phenotype_metadata_column, phenotype.columns)), ignore_index=True) @st.composite def keys(draw, size): return (special_ortho_group(draw(size), seed=draw(st.integers(min_value=0, max_value=2**32-1))) .rvs())