3 files changed, 134 insertions, 0 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md
new file mode 100644
index 0000000..27dfe2a
--- /dev/null
+++ b/doc/file-formats.md
@@ -0,0 +1,11 @@
+# File formats
+## summary file
+
+The summary file is ASCII encoded. It consists of two sections—the header and the data. Lines MUST be terminated in the Unix style with a newline (aka line feed, `\n`) character. Lines in the header section MUST be prefixed with `#`.
+
+The first line of the header section MUST be `# pyhegp summary file version 1`. Subsequent lines of the header section are a list of key-value pairs. Each line MUST be `#`, optional whitespace, the key, a single space character and then the value. The key MUST NOT contain whitespace or control characters, and MUST NOT begin with a `#` character. The value MAY contain whitespace characters, but MUST NOT contain control characters.
+
+The data section is a space-separated table of numbers. The first line of the data section is a vector of means—one for each SNP. The second line is a vector of standard deviations—one for each SNP.
+
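+Here is an example summary file for 100 samples and 3 SNPs.
+
+```
+# pyhegp summary file version 1
+# number-of-samples 100
+0.5 1.25 0.75
+0.1 0.2 0.3
+```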
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
new file mode 100644
index 0000000..cdf1587
--- /dev/null
+++ b/pyhegp/serialization.py
@@ -0,0 +1,53 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+from collections import namedtuple
+from itertools import takewhile
+
+import numpy as np
+
+# First line of a summary file. write_summary emits it verbatim and
+# read_summary_headers checks for it before parsing anything else.
+SUMMARY_HEADER = b"# pyhegp summary file version 1\n"
+
+# In-memory form of a summary file: n comes from the
+# number-of-samples header, mean and std from the two data rows.
+Summary = namedtuple("Summary", "n mean std")
+
+def peek(file):
+    """Return the next byte of file without consuming it.
+
+    file must be a seekable binary file object. An empty bytes object
+    is returned when file is already at end of file, and the position
+    is left untouched in that case.
+    """
+    c = file.read(1)
+    # Rewind by exactly what was read. At end of file nothing is read,
+    # so stepping back a fixed one byte would move the position before
+    # the last byte, or request a negative position in an empty file.
+    file.seek(-len(c), 1)
+    return c
+
+def header_lines(file):
+    """Yield successive lines of file for as long as they begin with `#`.
+
+    Iteration stops, without consuming anything further, at the first
+    line whose first byte is not `#` or at end of file.
+    """
+    while True:
+        if peek(file) != b"#":
+            return
+        yield file.readline()
+
+def read_summary_headers(file):
+    """Read the header section of a summary file.
+
+    file is a seekable binary file object positioned at the start of
+    the summary file. The first line must be the version line; every
+    following line that begins with `#` is parsed as a key-value pair
+    separated by a single space.
+
+    Return a dictionary mapping header keys to their string values.
+
+    Raise ValueError if the first line is not the expected version
+    line, or if a header line carries a key but no value.
+    """
+    def strip_comment_prefix(line):
+        # Drop the leading `#` characters and any whitespace after them.
+        return line.decode("ascii").lstrip("#").lstrip()
+
+    # Read the version line unconditionally. Doing the read inside an
+    # assert would skip it altogether under `python -O`, and the version
+    # line would then be misparsed as a key-value pair.
+    version_line = file.readline()
+    if strip_comment_prefix(version_line) != strip_comment_prefix(SUMMARY_HEADER):
+        raise ValueError("Not a pyhegp summary file; "
+                         f"unexpected first line {version_line!r}")
+    return dict(strip_comment_prefix(line.rstrip(b"\n")).split(" ", maxsplit=1)
+                for line in header_lines(file))
+
+def read_summary(file):
+    """Read a summary file and return it as a Summary.
+
+    The header section is parsed first; the rest of file is then
+    loaded as a two-dimensional table whose rows become the remaining
+    fields of the Summary, in order.
+    """
+    headers = read_summary_headers(file)
+    number_of_samples = int(headers["number-of-samples"])
+    rows = np.loadtxt(file, ndmin=2)
+    return Summary(number_of_samples, *rows)
+
+def write_summary(file, summary):
+    """Write summary to file in the summary file format.
+
+    file is a binary file object open for writing. summary provides
+    n, mean and std (such as a Summary); mean and std are stacked as
+    the two rows of the data section.
+
+    Values are written with 8 significant digits, so reading the file
+    back yields the original numbers only approximately.
+    """
+    file.write(SUMMARY_HEADER)
+    file.write(f"# number-of-samples {summary.n}\n".encode("ascii"))
+    # np.vstack rather than np.row_stack: the latter is an alias that
+    # is deprecated as of NumPy 2.0.
+    np.savetxt(file,
+               np.vstack((summary.mean, summary.std)),
+               fmt="%.8g")
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
new file mode 100644
index 0000000..984a935
--- /dev/null
+++ b/tests/test_serialization.py
@@ -0,0 +1,70 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+import tempfile
+
+from hypothesis import given, strategies as st
+from hypothesis.extra.numpy import arrays, array_shapes
+from pytest import approx
+
+from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers
+
+@given(st.integers(),
+       arrays("float64",
+              st.shared(array_shapes(max_dims=1), key="number-of-snps"),
+              elements=st.floats()),
+       arrays("float64",
+              st.shared(array_shapes(max_dims=1), key="number-of-snps"),
+              elements=st.floats()))
+def test_read_write_summary_are_inverses(n, mean, std):
+    """Writing a summary and reading it back gives an approximately equal summary."""
+    original = Summary(n, mean, std)
+    with tempfile.TemporaryFile() as summary_file:
+        write_summary(summary_file, original)
+        # Rewind so that the read starts from the version line.
+        summary_file.seek(0)
+        recovered = read_summary(summary_file)
+        assert recovered.n == n
+        assert recovered.mean == approx(mean, nan_ok=True)
+        assert recovered.std == approx(std, nan_ok=True)
+
+@st.composite
+def properties_and_whitespace(draw):
+    """Draw header properties together with whitespace widths.
+
+    Return a tuple of a dictionary of properties, the number of
+    spaces to put after the `#` of the version line, and a list with
+    one number of spaces per property.
+    """
+    number_of_properties = draw(st.integers(min_value=0, max_value=10))
+    # Keys are ASCII without control or space characters, and do not
+    # begin with `#`.
+    keys = (st.text(st.characters(codec="ascii",
+                                  exclude_categories=("Cc", "Zs")),
+                    min_size=1)
+            .filter(lambda key: not key.startswith("#")))
+    # Values are ASCII without control characters; spaces are allowed.
+    values = st.text(st.characters(codec="ascii",
+                                   exclude_categories=("Cc",)),
+                     min_size=1)
+    properties = draw(st.dictionaries(keys, values,
+                                      min_size=number_of_properties,
+                                      max_size=number_of_properties))
+    header_whitespace = draw(st.integers(min_value=0, max_value=10))
+    key_value_whitespace = draw(st.lists(st.integers(min_value=0, max_value=10),
+                                         min_size=number_of_properties,
+                                         max_size=number_of_properties))
+    return (properties, header_whitespace, key_value_whitespace)
+
+@given(properties_and_whitespace())
+def test_read_summary_headers_variable_whitespace(properties_and_whitespace):
+    """Headers parse the same whatever the amount of space after `#`."""
+    properties, header_whitespace, key_value_whitespace = properties_and_whitespace
+    version_line = (b"#" + b" " * header_whitespace
+                    + b"pyhegp summary file version 1\n")
+    property_lines = [b"#" + b" " * padding + f"{key} {value}\n".encode("ascii")
+                      for (key, value), padding
+                      in zip(properties.items(), key_value_whitespace)]
+    with tempfile.TemporaryFile() as file:
+        file.write(version_line)
+        file.writelines(property_lines)
+        file.seek(0)
+        assert properties == read_summary_headers(file)