about summary refs log tree commit diff
diff options
context:
space:
mode:
author Arun Isaac 2025-08-04 14:48:27 +0100
committer Arun Isaac 2025-08-06 22:40:42 +0100
commit bc046a25f1531386293a470e21b569f8411f2235 (patch)
tree 0b28824677dc7240d7290ae494827014a1ff3f67
parent 925bb7d67bcd7e5b756987093b15d21426852ba1 (diff)
download pyhegp-bc046a25f1531386293a470e21b569f8411f2235.tar.gz
pyhegp-bc046a25f1531386293a470e21b569f8411f2235.tar.lz
pyhegp-bc046a25f1531386293a470e21b569f8411f2235.zip
Standardize key files.
* doc/file-formats.md (File formats)[key file]: New section.
* pyhegp/serialization.py: Import numpy.
(read_key, write_key): New functions.
* pyhegp/pyhegp.py: Import write_key from pyhegp.serialization.
(encrypt): Use write_key.
* tests/test_serialization.py: Import arrays and array_shapes from
hypothesis.extra.numpy; approx from pytest; read_key and write_key
from pyhegp.serialization.
(test_read_write_key_are_inverses): New test.
-rw-r--r-- doc/file-formats.md 7
-rw-r--r-- pyhegp/pyhegp.py 4
-rw-r--r-- pyhegp/serialization.py 7
-rw-r--r-- tests/test_serialization.py 12
4 files changed, 27 insertions, 3 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md
index be8162f..02df7ef 100644
--- a/doc/file-formats.md
+++ b/doc/file-formats.md
@@ -20,3 +20,10 @@ the `reference` column is optional, and should be absent in encrypted genotype f
 
 Here is an example genotype file.
 `TODO: Add example.`
+
+## key file
+
+The key file is a tab-separated values (TSV) file with numerical data. There MUST be no column headers.
+
+Here is an example key file.
+`TODO: Add example.`
diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py
index 2dd9bec..9677b98 100644
--- a/pyhegp/pyhegp.py
+++ b/pyhegp/pyhegp.py
@@ -23,7 +23,7 @@ import numpy as np
 import pandas as pd
 from scipy.stats import special_ortho_group
 
-from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype
+from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype, write_key
 
 Stats = namedtuple("Stats", "n mean std")
 
@@ -123,7 +123,7 @@ def encrypt(genotype_file, summary_file, key_file, ciphertext_file):
         summary.data["std"].to_numpy()),
                                              key)
     if key_file:
-        np.savetxt(key_file, key, delimiter=",", fmt="%f")
+        write_key(key_file, key)
     write_genotype(ciphertext_file,
                    pd.concat((genotype[["chromosome", "position"]],
                               pd.DataFrame(encrypted_genotype_matrix.T,
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index 77fa4a5..a17c966 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -20,6 +20,7 @@ from collections import namedtuple
 import csv
 from itertools import takewhile
 
+import numpy as np
 import pandas as pd
 
 SUMMARY_HEADER = b"# pyhegp summary file version 1\n"
@@ -87,3 +88,9 @@ def write_genotype(file, genotype):
              sep="\t",
              float_format="%.8g",
              index=False))
+
+def read_key(file):
+    return np.loadtxt(file, delimiter="\t", ndmin=2)
+
+def write_key(file, key):
+    return np.savetxt(file, key, delimiter="\t", fmt="%.8g")
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index 15de278..a473796 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -19,10 +19,12 @@
 import tempfile
 
 from hypothesis import given, strategies as st
+from hypothesis.extra.numpy import arrays, array_shapes
 from hypothesis.extra.pandas import column, columns, data_frames
 import pandas as pd
+from pytest import approx
 
-from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype
+from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype, read_key, write_key
 from pyhegp.utils import negate
 
 tabless_printable_ascii_text = st.text(
@@ -115,3 +117,11 @@ def test_read_write_genotype_are_inverses(genotype):
         write_genotype(file, genotype)
         file.seek(0)
         pd.testing.assert_frame_equal(genotype, read_genotype(file))
+
+@given(arrays("float64",
+              array_shapes(min_dims=2, max_dims=2)))
+def test_read_write_key_are_inverses(key):
+    with tempfile.TemporaryFile() as file:
+        write_key(file, key)
+        file.seek(0)
+        assert key == approx(read_key(file), nan_ok=True)