1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
### pyhegp --- Homomorphic encryption of genotypes and phenotypes
### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
###
### This file is part of pyhegp.
###
### pyhegp is free software: you can redistribute it and/or modify it
### under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### pyhegp is distributed in the hope that it will be useful, but
### WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
### General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
from hypothesis import strategies as st
from hypothesis.extra.pandas import column, columns, data_frames
from pyhegp.serialization import Summary
from pyhegp.utils import negate
# Strategy producing non-empty ASCII strings that contain no control
# characters (and therefore no tab either).
tabless_printable_ascii_text = st.text(
    alphabet=st.characters(
        codec="ascii",
        # Exclude control characters and tab.
        exclude_categories=("Cc",),
        exclude_characters=("\t",),
    ),
    min_size=1,
)
# String column named "chromosome" whose elements are drawn from
# tabless_printable_ascii_text.
chromosome_column = column(
    name="chromosome",
    dtype="str",
    elements=tabless_printable_ascii_text,
)
# Integer column named "position". No elements strategy is given here,
# so value generation is left to hypothesis' handling of the dtype.
position_column = column(name="position", dtype="int")
# String column named "reference" holding non-empty strings built from
# the characters A, G, C and T.
reference_column = column(
    name="reference",
    dtype="str",
    elements=st.text(
        alphabet=st.characters(
            codec="ascii",
            # No character categories are enabled; only the characters
            # listed explicitly below are included.
            categories=(),
            include_characters=("A", "G", "C", "T"),
        ),
        min_size=1,
    ),
)
def genotype_reserved_column_name_p(name):
    """Return True if name is a reserved genotype column name.

    The comparison ignores case: name is lowercased and checked against
    "chromosome", "position" and "reference".
    """
    reserved_names = {"chromosome", "position", "reference"}
    return name.lower() in reserved_names
# Lists of distinct sample names. Candidate names come from
# tabless_printable_ascii_text and are filtered through the negation of
# genotype_reserved_column_name_p, so that (presumably; negate is
# defined in pyhegp.utils) reserved column names are rejected.
sample_names = st.lists(
    elements=tabless_printable_ascii_text.filter(
        negate(genotype_reserved_column_name_p)),
    unique=True,
)
@st.composite
def summaries(draw):
    """Draw a Summary from an arbitrary integer and a data frame.

    The data frame has the chromosome and position columns, optionally
    the reference column, and float64 "mean" and "std" columns whose
    values are never NaN.
    """
    number = draw(st.integers())
    key_columns = [chromosome_column, position_column]
    # The reference column is optional; include it only on some draws.
    if draw(st.booleans()):
        key_columns.append(reference_column)
    statistics_columns = columns(["mean", "std"],
                                 dtype="float64",
                                 elements=st.floats(allow_nan=False))
    frame = draw(data_frames(columns=key_columns + statistics_columns))
    return Summary(number, frame)
@st.composite
def genotype_frames(draw):
    """Draw a genotype data frame with no duplicated key rows.

    The frame has the chromosome and position columns, optionally the
    reference column, and one float64 column (never NaN) per drawn
    sample name. Rows that repeat an earlier row's values in the
    reserved columns are dropped and the index is renumbered.
    """
    key_columns = [chromosome_column, position_column]
    # The reference column is optional; include it only on some draws.
    if draw(st.booleans()):
        key_columns.append(reference_column)
    sample_columns = columns(draw(sample_names),
                             dtype="float64",
                             elements=st.floats(allow_nan=False))
    frame = draw(data_frames(columns=key_columns + sample_columns))
    # Deduplicate on whichever reserved columns are actually present.
    reserved_columns = [name for name in frame.columns
                        if genotype_reserved_column_name_p(name)]
    return frame.drop_duplicates(subset=reserved_columns,
                                 ignore_index=True)
def phenotype_reserved_column_name_p(name):
    """Return True if name is the reserved phenotype column name.

    The comparison ignores case: name is lowercased and compared with
    "sample-id".
    """
    lowercased = name.lower()
    return lowercased == "sample-id"
# Lists of distinct phenotype names. Candidate names come from
# tabless_printable_ascii_text and are filtered through the negation of
# phenotype_reserved_column_name_p, so that (presumably; negate is
# defined in pyhegp.utils) the reserved column name is rejected.
phenotype_names = st.lists(
    elements=tabless_printable_ascii_text.filter(
        negate(phenotype_reserved_column_name_p)),
    unique=True,
)
@st.composite
def phenotype_frames(draw):
    """Draw a phenotype data frame.

    The frame has a string "sample-id" column with elements drawn from
    tabless_printable_ascii_text, followed by one float64 column (never
    NaN) per drawn phenotype name.
    """
    sample_id_column = column(name="sample-id",
                              dtype="str",
                              elements=tabless_printable_ascii_text)
    phenotype_columns = columns(draw(phenotype_names),
                                dtype="float64",
                                elements=st.floats(allow_nan=False))
    return draw(data_frames(columns=[sample_id_column] + phenotype_columns))
|