aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/file-formats.md11
-rw-r--r--pyhegp/serialization.py53
-rw-r--r--tests/test_serialization.py70
3 files changed, 134 insertions, 0 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md
new file mode 100644
index 0000000..27dfe2a
--- /dev/null
+++ b/doc/file-formats.md
@@ -0,0 +1,11 @@
+# File formats
+## summary file
+
+The summary file is ASCII-encoded. It consists of two sections: the header and the data. Every line MUST be terminated Unix-style, with a single newline (line feed) character. Every line in the header section MUST be prefixed with `#`.
+
+The first line of the header section MUST be `# pyhegp summary file version 1`. Subsequent lines of the header section are a list of key-value pairs. Each line MUST be `#`, optional whitespace, the key, a single space character and then the value. The key MUST NOT contain whitespace or control characters, and MUST NOT begin with a `#` character. The value MAY contain whitespace characters, but MUST NOT contain control characters.
+
+The data section is a space-separated table of numbers with two lines. The first line is a vector of means, one for each SNP. The second line is a vector of standard deviations, one for each SNP.
+
+Here is an example summary file.
+
+```
+# pyhegp summary file version 1
+# number-of-samples 100
+0.5 1.25 2
+0.1 0.2 0.3
+```
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
new file mode 100644
index 0000000..cdf1587
--- /dev/null
+++ b/pyhegp/serialization.py
@@ -0,0 +1,53 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+from collections import namedtuple
+from itertools import takewhile
+
+import numpy as np
+
# Magic first line of every summary file, as raw bytes including the
# terminating newline.
SUMMARY_HEADER = b"# pyhegp summary file version 1\n"

# In-memory form of a summary file: n is the value of the
# number-of-samples header; mean and std are the two rows of the data
# section.
Summary = namedtuple("Summary", ["n", "mean", "std"])
+
def peek(file):
    """Return the next byte of *file* without consuming it.

    *file* must be a seekable file object opened in binary mode.

    Returns a bytes object of length 1, or ``b""`` when *file* is at
    end of file. In both cases the file position is left unchanged.
    """
    c = file.read(1)
    # Rewind only when a byte was actually consumed. At end of file
    # read() returns b"" without advancing, so seeking back would move
    # the position before the last real byte (or before offset 0, which
    # raises OSError on real files).
    if c:
        file.seek(-1, 1)
    return c
+
def header_lines(file):
    """Yield successive lines of *file* for as long as they start with ``#``.

    *file* is a seekable binary file object. Each yielded line is the
    raw bytes returned by ``readline``. Iteration stops at the first
    line that does not begin with ``#``; that line is left unread.
    """
    while True:
        if peek(file) != b"#":
            return
        yield file.readline()
+
def read_summary_headers(file):
    """Read the header section of a summary file.

    *file* is a seekable binary file object positioned at the start of
    the summary file. On return it is positioned at the first line
    that does not start with ``#``.

    Returns a dict mapping each header key to its value, both as str.
    The leading ``#`` characters and any whitespace after them are
    ignored; the key ends at the first space and the value is the rest
    of the line without its trailing newline.

    Raises ValueError if the first line is not the summary file magic
    line, or if a header line has no space separating key and value.
    """
    def strip_marker(text):
        # Drop the leading "#" marker and the optional whitespace after it.
        return text.lstrip("#").lstrip()

    first_line = file.readline()
    # Validate explicitly instead of using assert: assertions are
    # stripped when Python runs with -O, which would silently accept
    # files of any format.
    if (strip_marker(first_line.decode("ascii"))
            != strip_marker(SUMMARY_HEADER.decode("ascii"))):
        raise ValueError(f"Not a pyhegp summary file: unexpected first line {first_line!r}")

    headers = {}
    for line in header_lines(file):
        key, separator, value = (strip_marker(line.decode("ascii").rstrip("\n"))
                                 .partition(" "))
        if not separator:
            raise ValueError(f"Summary header line {line!r} is not a key-value pair")
        headers[key] = value
    return headers
+
def read_summary(file):
    """Read a summary file and return it as a Summary.

    *file* is a seekable binary file object positioned at the start of
    the summary file. The ``number-of-samples`` header becomes the
    integer ``n``; the rows of the data section, loaded as a 2-D
    array, become the remaining fields in order.
    """
    headers = read_summary_headers(file)
    n = int(headers["number-of-samples"])
    # ndmin=2 keeps one row per line even when each line holds a
    # single number.
    rows = np.loadtxt(file, ndmin=2)
    return Summary(n, *rows)
+
def write_summary(file, summary):
    """Write *summary* to *file* in the summary file format.

    *file* is a file object opened for writing in binary mode.
    *summary* provides ``n``, ``mean`` and ``std``; ``mean`` and
    ``std`` are stacked as the two rows of the data section.

    Numbers are written with 8 significant digits (``%.8g``), so
    reading the file back yields values that are close, but not
    necessarily identical, to the originals.
    """
    file.write(SUMMARY_HEADER)
    file.write(f"# number-of-samples {summary.n}\n".encode("ascii"))
    # np.vstack is the supported spelling of np.row_stack, which is
    # deprecated since NumPy 2.0.
    np.savetxt(file,
               np.vstack((summary.mean, summary.std)),
               fmt="%.8g")
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
new file mode 100644
index 0000000..984a935
--- /dev/null
+++ b/tests/test_serialization.py
@@ -0,0 +1,70 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+import tempfile
+
+from hypothesis import given, strategies as st
+from hypothesis.extra.numpy import arrays, array_shapes
+from pytest import approx
+
+from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers
+
@given(st.integers(),
       arrays("float64",
              st.shared(array_shapes(max_dims=1), key="number-of-snps"),
              elements=st.floats()),
       arrays("float64",
              st.shared(array_shapes(max_dims=1), key="number-of-snps"),
              elements=st.floats()))
def test_read_write_summary_are_inverses(n, mean, std):
    """Writing a summary and reading it back recovers the same summary.

    mean and std share one shape through the "number-of-snps" key.
    The vectors are compared approximately, with NaN treated as equal
    to NaN.
    """
    original = Summary(n, mean, std)
    with tempfile.TemporaryFile() as file:
        write_summary(file, original)
        file.seek(0)
        recovered = read_summary(file)
    assert recovered.n == original.n
    assert recovered.mean == approx(original.mean, nan_ok=True)
    assert recovered.std == approx(original.std, nan_ok=True)
+
@st.composite
def properties_and_whitespace(draw):
    """Draw header properties plus the whitespace widths to write them with.

    Returns a 3-tuple:
    - a dict of n key-value pairs (n between 0 and 10); keys are
      non-empty ASCII text without control or space-separator
      characters and do not start with "#"; values are non-empty
      ASCII text without control characters,
    - an integer between 0 and 10,
    - a list of n integers between 0 and 10.
    """
    width = st.integers(min_value=0, max_value=10)
    keys = (st.text(st.characters(codec="ascii",
                                  exclude_categories=("Cc", "Zs")),
                    min_size=1)
            .filter(lambda key: not key.startswith("#")))
    values = st.text(st.characters(codec="ascii",
                                   exclude_categories=("Cc",)),
                     min_size=1)

    n = draw(width)
    properties = draw(st.dictionaries(keys, values, min_size=n, max_size=n))
    header_whitespace = draw(width)
    key_value_whitespace = draw(st.lists(width, min_size=n, max_size=n))
    return properties, header_whitespace, key_value_whitespace
+
@given(properties_and_whitespace())
def test_read_summary_headers_variable_whitespace(properties_and_whitespace):
    """Headers parse the same however many spaces follow the leading "#"."""
    properties, header_indent, line_indents = properties_and_whitespace
    with tempfile.TemporaryFile() as file:
        file.write(b"#" + b" " * header_indent
                   + b"pyhegp summary file version 1\n")
        for (key, value), indent in zip(properties.items(), line_indents):
            file.write(b"#" + b" " * indent
                       + f"{key} {value}\n".encode("ascii"))
        file.seek(0)
        parsed = read_summary_headers(file)
    assert properties == parsed