tests/helpers/strategies.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

### pyhegp --- Homomorphic encryption of genotypes and phenotypes
### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
###
### This file is part of pyhegp.
###
### pyhegp is free software: you can redistribute it and/or modify it
### under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### pyhegp is distributed in the hope that it will be useful, but
### WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
### General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.

from hypothesis import strategies as st
from hypothesis.extra.pandas import column, columns, data_frames, range_indexes
import pandas as pd
from scipy.stats import special_ortho_group
from typing import assert_never

from pyhegp.serialization import Summary, is_genotype_metadata_column, is_phenotype_metadata_column
from pyhegp.utils import negate

tabless_printable_ascii_text = st.text(
    # Exclude control characters and tab.
    st.characters(codec="ascii",
                  exclude_categories=("Cc",),
                  exclude_characters=("\t",)),
    min_size=1)

chromosomes = tabless_printable_ascii_text
positions = st.integers(min_value=0,
                        max_value=10*10**9)
references = st.text(st.characters(codec="ascii",
                                   categories=(),
                                   include_characters=("A", "G", "C", "T")),
                     min_size=1)

sample_names = (tabless_printable_ascii_text
                .filter(negate(is_genotype_metadata_column)))

def genotype_metadata(draw, number_of_snps, reference_present):
    match list(zip(*draw(st.lists(st.tuples(chromosomes, positions, references)
                                  if reference_present
                                  else st.tuples(chromosomes, positions),
                                  min_size=number_of_snps,
                                  max_size=number_of_snps,
                                  unique=True)))):
        case []:
            return pd.DataFrame({"chromosome": pd.Series(dtype="str"),
                                 "position": pd.Series(dtype="int")}
                                | ({"reference": pd.Series(dtype="str")}
                                   if reference_present else {}))
        case chromosomes_lst, positions_lst, *references_lst:
            return pd.DataFrame({"chromosome": pd.Series(chromosomes_lst, dtype="str"),
                                 "position": pd.Series(positions_lst, dtype="int")}
                                | ({"reference": pd.Series(*references_lst, dtype="str")}
                                   if reference_present else {}))
        case _ as unreachable:
            assert_never(unreachable)

@st.composite
def summaries(draw):
    stats = draw(data_frames(
        columns=columns(["mean", "std"],
                        dtype="float64",
                        elements=st.floats(allow_nan=False))))
    return Summary(draw(st.integers()),
                   pd.concat((genotype_metadata(draw,
                                                len(stats),
                                                draw(st.booleans())),
                              stats),
                             axis="columns"))

@st.composite
def genotype_frames(draw,
                    number_of_samples=st.integers(min_value=0,
                                                  max_value=10),
                    reference_present=st.booleans()):
    _number_of_samples = draw(number_of_samples)
    dosages = draw(data_frames(
        columns=columns(draw(st.lists(sample_names,
                                      min_size=_number_of_samples,
                                      max_size=_number_of_samples,
                                      unique=True)),
                        dtype="float64",
                        elements=st.floats(min_value=0,
                                           max_value=100,
                                           allow_nan=False))))
    return pd.concat((genotype_metadata(draw,
                                        len(dosages),
                                        draw(reference_present)),
                      dosages),
                     axis="columns")

phenotype_names = st.lists(tabless_printable_ascii_text
                           .filter(negate(is_phenotype_metadata_column)),
                           unique=True)

@st.composite
def phenotype_frames(draw,
                     number_of_samples=st.integers(min_value=0,
                                                   max_value=10)):
    _number_of_samples = draw(number_of_samples)
    return draw(data_frames(
        columns=([column(name="sample-id",
                         dtype="str",
                         elements=tabless_printable_ascii_text,
                         unique=True)]
                 + columns(draw(phenotype_names),
                           dtype="float64",
                           elements=st.floats(min_value=-1000,
                                              max_value=1000,
                                              allow_nan=False))),
        index=range_indexes(min_size=_number_of_samples,
                            max_size=_number_of_samples)))

@st.composite
def keys(draw, size):
    return (special_ortho_group(draw(size),
                                seed=draw(st.integers(min_value=0,
                                                      max_value=2**32-1)))
            .rvs())