From 1ed0e16a4707222e07a68f57d231af1cd00fea73 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Mon, 14 Jul 2025 14:25:27 +0100 Subject: Implement the summary file format. * doc/file-formats.md, pyhegp/serialization.py, tests/test_serialization.py: New files. --- doc/file-formats.md | 11 +++++++ pyhegp/serialization.py | 53 ++++++++++++++++++++++++++++++++++ tests/test_serialization.py | 70 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 doc/file-formats.md create mode 100644 pyhegp/serialization.py create mode 100644 tests/test_serialization.py diff --git a/doc/file-formats.md b/doc/file-formats.md new file mode 100644 index 0000000..27dfe2a --- /dev/null +++ b/doc/file-formats.md @@ -0,0 +1,11 @@ +# File formats +## summary file + +The summary file is ASCII encoded. It consists of two sections—the header and the data. Lines MUST be terminated in the Unix style with a new line (aka line feed) character. Lines in the header section MUST be prefixed with `#`. + +The first line of the header section MUST be `# pyhegp summary file version 1`. Subsequent lines of the header section are a list of key-value pairs. Each line MUST be `#`, optional whitespace, the key, a single space character and then the value. The key MUST NOT contain whitespace or control characters, and MUST NOT begin with a `#` character. The value MAY contain whitespace characters, but MUST NOT contain control characters. + +The data section is a space separated table of numbers. The first line of the data section is a vector of means—one for each SNP. The second line is a vector of standard deviations—one for each SNP. + +Here is an example summary file. 
# Patch context that shared this source line with the module (kept verbatim):
#   +`TODO: Add example.`
#   diff --git a/pyhegp/serialization.py new file mode 100644
#   index 0000000..cdf1587
#   --- /dev/null
#   +++ b/pyhegp/serialization.py
#   @@ -0,0 +1,53 @@

### pyhegp --- Homomorphic encryption of genotypes and phenotypes
### Copyright © 2025 Arun Isaac
###
### This file is part of pyhegp.
###
### pyhegp is free software: you can redistribute it and/or modify it
### under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### pyhegp is distributed in the hope that it will be useful, but
### WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
### General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.

"""Read and write the pyhegp summary file format.

A summary file is ASCII. It starts with a header section of ``#``-prefixed
lines (a fixed version line followed by ``key value`` pairs) and ends with
a data section: a space separated table whose first row holds the means
and whose second row holds the standard deviations.

All functions expect a seekable file object opened in binary mode.
"""

from collections import namedtuple
from itertools import takewhile

import numpy as np

# First line of every summary file.
SUMMARY_HEADER = b"# pyhegp summary file version 1\n"

# n: number of samples (stored in the "number-of-samples" header).
# mean, std: the two rows of the data section.
Summary = namedtuple("Summary", "n mean std")


class SummaryFormatError(ValueError, AssertionError):
    """Raised when a file does not follow the summary file format.

    The version line used to be checked with a bare ``assert``, which is
    silently skipped under ``python -O``.  This class also derives from
    AssertionError so that callers that caught the old failure keep
    working.
    """


def _strip_comment_prefix(text):
    """Drop the leading ``#`` characters and the whitespace after them."""
    return text.lstrip("#").lstrip()


def peek(file):
    """Return the next byte of *file* without consuming it.

    Returns ``b""`` at end of file.  The stream position is restored only
    when a byte was actually read; seeking back unconditionally would
    rewind past the last byte at end of file (or fail on an empty file).
    """
    c = file.read(1)
    if c:
        file.seek(-len(c), 1)
    return c


def header_lines(file):
    """Yield consecutive lines of *file* for as long as they begin with ``#``.

    The first line that does not begin with ``#`` is left unread.
    """
    while peek(file) == b"#":
        yield file.readline()


def read_summary_headers(file):
    """Read the header section of a summary file into a dictionary.

    *file* must be positioned at the start of the summary file.  On
    return it is positioned at the first line of the data section.

    Returns a dict mapping each header key to its value, both as str.
    When a key is repeated, the last value wins.

    Raises SummaryFormatError if the version line is wrong or a header
    line has no value, and UnicodeDecodeError if a header line is not
    ASCII.
    """
    first_line = file.readline().decode("ascii")
    expected = SUMMARY_HEADER.decode("ascii")
    # The amount of whitespace after the "#" is allowed to vary.
    if _strip_comment_prefix(first_line) != _strip_comment_prefix(expected):
        raise SummaryFormatError(
            f"not a pyhegp summary file: unexpected first line {first_line!r}")
    headers = {}
    for raw in header_lines(file):
        line = raw.decode("ascii").rstrip("\n")
        # The key ends at the first space; the value may itself contain
        # spaces.
        key, separator, value = _strip_comment_prefix(line).partition(" ")
        if not separator:
            raise SummaryFormatError(
                f"header line {line!r} is not a key-value pair")
        headers[key] = value
    return headers


def read_summary(file):
    """Read a complete summary file and return it as a Summary.

    The sample count comes from the ``number-of-samples`` header, and
    the rows of the data section become ``mean`` and ``std``.

    Raises KeyError if the ``number-of-samples`` header is missing, and
    TypeError if the data section does not have exactly two rows.
    """
    headers = read_summary_headers(file)
    return Summary(int(headers["number-of-samples"]),
                   *np.loadtxt(file, ndmin=2))


def write_summary(file, summary):
    """Write *summary* to *file* in the summary file format.

    Numbers are written with 8 significant digits, so reading the file
    back reproduces the values only approximately.
    """
    file.write(SUMMARY_HEADER)
    file.write(f"# number-of-samples {summary.n}\n".encode("ascii"))
    # np.vstack replaces np.row_stack, a deprecated alias of it.
    np.savetxt(file,
               np.vstack((summary.mean, summary.std)),
               fmt="%.8g")

# Patch context that shared this source line with the module (kept verbatim):
#   diff --git a/tests/test_serialization.py
# Patch context that shared these source lines with the test module
# (kept verbatim):
#   b/tests/test_serialization.py new file mode 100644
#   index 0000000..984a935
#   --- /dev/null
#   +++ b/tests/test_serialization.py
#   @@ -0,0 +1,70 @@

### pyhegp --- Homomorphic encryption of genotypes and phenotypes
### Copyright © 2025 Arun Isaac
###
### This file is part of pyhegp.
###
### pyhegp is free software: you can redistribute it and/or modify it
### under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### pyhegp is distributed in the hope that it will be useful, but
### WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
### General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.

import tempfile

from hypothesis import given, strategies as st
from hypothesis.extra.numpy import arrays, array_shapes
from pytest import approx

from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers


def _snp_vectors():
    """Strategy for a float64 vector whose shape is shared under one key.

    Every strategy built by this helper uses the same ``st.shared`` key,
    so the vectors drawn within one example all have the same shape.
    """
    return arrays("float64",
                  st.shared(array_shapes(max_dims=1), key="number-of-snps"),
                  elements=st.floats())


@given(st.integers(), _snp_vectors(), _snp_vectors())
def test_read_write_summary_are_inverses(n, mean, std):
    """Writing a summary and reading it back gives the same summary."""
    with tempfile.TemporaryFile() as file:
        write_summary(file, Summary(n, mean, std))
        file.seek(0)
        recovered = read_summary(file)
    assert recovered.n == n
    # Values are written with limited precision, so compare approximately.
    assert recovered.mean == approx(mean, nan_ok=True)
    assert recovered.std == approx(std, nan_ok=True)


@st.composite
def properties_and_whitespace(draw):
    """Draw header properties and the padding to write them with.

    Returns a tuple of the properties dictionary, the number of spaces
    after the ``#`` of the version line, and a list holding the number
    of spaces after the ``#`` of each property line.
    """
    count = draw(st.integers(min_value=0, max_value=10))
    # Keys: non-empty ASCII without control or space characters, and not
    # starting with "#".
    keys = (st.text(st.characters(codec="ascii",
                                  exclude_categories=("Cc", "Zs")),
                    min_size=1)
            .filter(lambda key: not key.startswith("#")))
    # Values: non-empty ASCII without control characters.
    values = st.text(st.characters(codec="ascii",
                                   exclude_categories=("Cc",)),
                     min_size=1)
    properties = draw(st.dictionaries(keys, values,
                                      min_size=count, max_size=count))
    header_padding = draw(st.integers(min_value=0, max_value=10))
    line_paddings = draw(st.lists(st.integers(min_value=0, max_value=10),
                                  min_size=count, max_size=count))
    return (properties, header_padding, line_paddings)


@given(properties_and_whitespace())
def test_read_summary_headers_variable_whitespace(properties_and_whitespace):
    """Headers parse the same whatever the padding after the ``#``."""
    properties, header_padding, line_paddings = properties_and_whitespace
    lines = [b"#" + b" " * header_padding + b"pyhegp summary file version 1\n"]
    lines.extend(b"#" + b" " * padding + f"{key} {value}\n".encode("ascii")
                 for (key, value), padding in zip(properties.items(),
                                                  line_paddings))
    with tempfile.TemporaryFile() as file:
        file.writelines(lines)
        file.seek(0)
        assert properties == read_summary_headers(file)

# Patch context that shared these source lines with the test module
# (kept verbatim):
#   -- cgit v1.2.3