diff options
author | Arun Isaac | 2025-08-04 14:48:27 +0100 |
---|---|---|
committer | Arun Isaac | 2025-08-06 22:40:42 +0100 |
commit | bc046a25f1531386293a470e21b569f8411f2235 (patch) | |
tree | 0b28824677dc7240d7290ae494827014a1ff3f67 | |
parent | 925bb7d67bcd7e5b756987093b15d21426852ba1 (diff) | |
download | pyhegp-bc046a25f1531386293a470e21b569f8411f2235.tar.gz pyhegp-bc046a25f1531386293a470e21b569f8411f2235.tar.lz pyhegp-bc046a25f1531386293a470e21b569f8411f2235.zip |
Standardize key files.
* doc/file-formats.md (File formats)[key file]: New section.
* pyhegp/serialization.py: Import numpy.
(read_key, write_key): New functions.
* pyhegp/pyhegp.py: Import write_key from pyhegp.serialization.
(encrypt): Use write_key.
* tests/test_serialization.py: Import arrays and array_shapes from hypothesis.extra.numpy; approx from pytest; read_key and write_key from pyhegp.serialization.
(test_read_write_key_are_inverses): New test.
-rw-r--r-- | doc/file-formats.md | 7 | ||||
-rw-r--r-- | pyhegp/pyhegp.py | 4 | ||||
-rw-r--r-- | pyhegp/serialization.py | 7 | ||||
-rw-r--r-- | tests/test_serialization.py | 12 |
4 files changed, 27 insertions, 3 deletions
diff --git a/doc/file-formats.md b/doc/file-formats.md index be8162f..02df7ef 100644 --- a/doc/file-formats.md +++ b/doc/file-formats.md @@ -20,3 +20,10 @@ the `reference` column is optional, and should be absent in encrypted genotype f Here is an example genotype file. `TODO: Add example.` + +## key file + +The key file is a tab-separated values (TSV) file with numerical data. There MUST be no column headers. + +Here is an example key file. +`TODO: Add example.` diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py index 2dd9bec..9677b98 100644 --- a/pyhegp/pyhegp.py +++ b/pyhegp/pyhegp.py @@ -23,7 +23,7 @@ import numpy as np import pandas as pd from scipy.stats import special_ortho_group -from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype +from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype, write_key Stats = namedtuple("Stats", "n mean std") @@ -123,7 +123,7 @@ def encrypt(genotype_file, summary_file, key_file, ciphertext_file): summary.data["std"].to_numpy()), key) if key_file: - np.savetxt(key_file, key, delimiter=",", fmt="%f") + write_key(key_file, key) write_genotype(ciphertext_file, pd.concat((genotype[["chromosome", "position"]], pd.DataFrame(encrypted_genotype_matrix.T, diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py index 77fa4a5..a17c966 100644 --- a/pyhegp/serialization.py +++ b/pyhegp/serialization.py @@ -20,6 +20,7 @@ from collections import namedtuple import csv from itertools import takewhile +import numpy as np import pandas as pd SUMMARY_HEADER = b"# pyhegp summary file version 1\n" @@ -87,3 +88,9 @@ def write_genotype(file, genotype): sep="\t", float_format="%.8g", index=False)) + +def read_key(file): + return np.loadtxt(file, delimiter="\t", ndmin=2) + +def write_key(file, key): + return np.savetxt(file, key, delimiter="\t", fmt="%.8g") diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 15de278..a473796 100644 
--- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -19,10 +19,12 @@ import tempfile from hypothesis import given, strategies as st +from hypothesis.extra.numpy import arrays, array_shapes from hypothesis.extra.pandas import column, columns, data_frames import pandas as pd +from pytest import approx -from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype +from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype, read_key, write_key from pyhegp.utils import negate tabless_printable_ascii_text = st.text( @@ -115,3 +117,11 @@ def test_read_write_genotype_are_inverses(genotype): write_genotype(file, genotype) file.seek(0) pd.testing.assert_frame_equal(genotype, read_genotype(file)) + +@given(arrays("float64", + array_shapes(min_dims=2, max_dims=2))) +def test_read_write_key_are_inverses(key): + with tempfile.TemporaryFile() as file: + write_key(file, key) + file.seek(0) + assert key == approx(read_key(file), nan_ok=True) |