diff options
-rw-r--r-- | .guix/pyhegp-package.scm | 3 | ||||
-rw-r--r-- | doc/file-formats.md | 13 | ||||
-rw-r--r-- | pyhegp/pyhegp.py | 40 | ||||
-rw-r--r-- | pyhegp/serialization.py | 47 | ||||
-rw-r--r-- | pyproject.toml | 1 | ||||
-rw-r--r-- | tests/test_serialization.py | 76 |
6 files changed, 138 insertions, 42 deletions
diff --git a/.guix/pyhegp-package.scm b/.guix/pyhegp-package.scm index cf132c5..6778e38 100644 --- a/.guix/pyhegp-package.scm +++ b/.guix/pyhegp-package.scm @@ -20,7 +20,7 @@ #:use-module ((gnu packages check) #:select (python-hypothesis-next)) #:use-module ((gnu packages check) #:select (python-pytest) #:prefix guix:) #:use-module ((gnu packages python-build) #:select (python-flit-core)) - #:use-module ((gnu packages python-science) #:select (python-scipy)) + #:use-module ((gnu packages python-science) #:select (python-pandas python-scipy)) #:use-module ((gnu packages python-xyz) #:select (python-click python-numpy)) #:use-module (guix build-system pyproject) #:use-module (guix gexp) @@ -53,6 +53,7 @@ (propagated-inputs (list python-click python-numpy + python-pandas python-scipy)) (home-page "https://github.com/encryption4genetics/pyhegp") (synopsis "Homomorphic encryption of genotypes and phenotypes") diff --git a/doc/file-formats.md b/doc/file-formats.md index 4d3bfcd..be8162f 100644 --- a/doc/file-formats.md +++ b/doc/file-formats.md @@ -5,7 +5,18 @@ The summary file is ASCII encoded. It consists of two sections—the header and The first line of the header section MUST be `# pyhegp summary file version 1`. Subsequent lines of the header section are a list of key-value pairs. Each line MUST be `#`, optional whitespace, the key, a single space character and then the value. The key MUST NOT contain whitespace or control characters, and MUST NOT begin with a `#` character. The value MAY contain whitespace characters, but MUST NOT contain control characters. -The data section is a tab-separated table of numbers. The first line of the data section is a vector of means—one for each SNP. The second line is a vector of standard deviations—one for each SNP. +The data section is a tab-separated table of numbers. The first line MUST be a header with column labels. Each row corresponds to one SNP. The columns labelled `chromosome`, `position`, `reference`, `mean` and `standard-deviation` contain the chromosome, the position of the SNP on the chromosome, the reference allele, the mean dosage and the standard deviation of the dosage for that SNP. Column labels are case-sensitive. + +The `reference` column is optional, and SHOULD be absent in pooled summary files. Here is an example summary file. `TODO: Add example.` + +## genotype file + +The genotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one SNP. The columns labelled `chromosome`, `position` and `reference` contain the chromosome, the position on the chromosome and the reference allele for that SNP. Other columns each contain dosage values for one sample. The headers of these columns MUST be their sample identifiers. Column headers are case-sensitive. + +the `reference` column is optional, and should be absent in encrypted genotype files. + +Here is an example genotype file. +`TODO: Add example.` diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py index 8de6f9d..ddc796f 100644 --- a/pyhegp/pyhegp.py +++ b/pyhegp/pyhegp.py @@ -20,6 +20,7 @@ from collections import namedtuple import click import numpy as np +import pandas as pd from scipy.stats import special_ortho_group from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype @@ -67,10 +68,14 @@ def main(): help="output file") def summary(genotype_file, summary_file): genotype = read_genotype(genotype_file) + matrix = genotype.drop(columns=["chromosome", "position", "reference"]).to_numpy() write_summary(summary_file, Summary(genotype.shape[0], - np.mean(genotype, axis=0), - np.std(genotype, axis=0))) + pd.DataFrame({"chromosome": genotype.chromosome, + "position": genotype.position, + "reference": genotype.reference, + "mean": np.mean(matrix, axis=1), + "std": np.std(matrix, axis=1)}))) @main.command() @click.option("--output", "-o", "pooled_summary_file", @@ -80,12 +85,16 @@ def summary(genotype_file, summary_file): @click.argument("summary-files", type=click.File("rb"), nargs=-1) def pool(pooled_summary_file, summary_files): summaries = [read_summary(file) for file in summary_files] - pooled_stats = pool_stats([Stats(summary.n, summary.mean, summary.std) + pooled_stats = pool_stats([Stats(summary.n, + summary.data["mean"].to_numpy(), + summary.data["std"].to_numpy()) for summary in summaries]) write_summary(pooled_summary_file, Summary(pooled_stats.n, - pooled_stats.mean, - pooled_stats.std)) + pd.concat((summaries[0].data[["chromosome", "position"]], + pd.DataFrame({"mean": pooled_stats.mean, + "std": pooled_stats.std})), + axis="columns"))) @main.command() @click.argument("genotype-file", type=click.File("r")) @@ -99,16 +108,23 @@ def pool(pooled_summary_file, summary_files): help="Output ciphertext") def encrypt(genotype_file, summary_file, key_file, ciphertext_file): genotype = read_genotype(genotype_file) + sample_names = genotype.drop(columns=["chromosome", "position", "reference"]).columns + genotype_matrix = genotype[sample_names].to_numpy().T summary = read_summary(summary_file) rng = np.random.default_rng() - key = random_key(rng, len(genotype)) - encrypted_genotype = hegp_encrypt(standardize(genotype, - summary.mean, - summary.std), - key) + key = random_key(rng, len(genotype_matrix)) + encrypted_genotype_matrix = hegp_encrypt(standardize( + genotype_matrix, + summary.data["mean"].to_numpy(), + summary.data["std"].to_numpy()), + key) if key_file: np.savetxt(key_file, key, delimiter=",", fmt="%f") - write_genotype(ciphertext_file, encrypted_genotype) + write_genotype(ciphertext_file, + pd.concat((genotype[["chromosome", "position"]], + pd.DataFrame(encrypted_genotype_matrix.T, + columns=sample_names)), + axis="columns")) @main.command() @click.option("--output", "-o", "output_file", @@ -118,7 +134,7 @@ def encrypt(genotype_file, summary_file, key_file, ciphertext_file): @click.argument("ciphertext-files", type=click.File("rb"), nargs=-1) def cat(output_file, ciphertext_files): write_genotype(output_file, - np.vstack([read_genotype(file) for file in ciphertext_files])) + pd.concat([read_genotype(file) for file in ciphertext_files])) if __name__ == "__main__": main() diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py index b0be568..77fa4a5 100644 --- a/pyhegp/serialization.py +++ b/pyhegp/serialization.py @@ -17,13 +17,14 @@ ### along with pyhegp. If not, see <https://www.gnu.org/licenses/>. from collections import namedtuple +import csv from itertools import takewhile -import numpy as np +import pandas as pd SUMMARY_HEADER = b"# pyhegp summary file version 1\n" -Summary = namedtuple("Summary", "n mean std") +Summary = namedtuple("Summary", "n data") def peek(file): c = file.read(1) @@ -43,18 +44,46 @@ def read_summary_headers(file): def read_summary(file): headers = read_summary_headers(file) return Summary(int(headers["number-of-samples"]), - *np.loadtxt(file, ndmin=2, delimiter="\t")) + pd.read_csv(file, + sep="\t", + header=0, + dtype={"chromosome": "str", + "position": "int", + "reference": "str", + "mean": "float", + "standard-deviation": "float"}, + na_filter=False) + .rename(columns={"standard-deviation": "std"})) def write_summary(file, summary): file.write(SUMMARY_HEADER) file.write(f"# number-of-samples {summary.n}\n".encode("ascii")) - np.savetxt(file, - np.row_stack((summary.mean, summary.std)), - delimiter="\t", - fmt="%.8g") + (summary.data + .rename(columns={"std": "standard-deviation"}) + .to_csv(file, + sep="\t", + float_format="%.8g", + index=False)) def read_genotype(file): - return np.loadtxt(file, delimiter=",", ndmin=2) + df = pd.read_csv(file, + quoting=csv.QUOTE_NONE, + sep="\t", + na_filter=False) + sample_columns = [column + for column in df.columns + if column not in ["chromosome", "position", "reference"]] + df.chromosome = df.chromosome.astype("str") + df.position = df.position.astype("int") + if "reference" in df: + df.reference = df.reference.astype("str") + df[sample_columns] = df[sample_columns].astype("float") + return df def write_genotype(file, genotype): - np.savetxt(file, genotype, delimiter=",", fmt="%.8g") + (genotype + .to_csv(file, + quoting=csv.QUOTE_NONE, + sep="\t", + float_format="%.8g", + index=False)) diff --git a/pyproject.toml b/pyproject.toml index c0a6ab2..636a56d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ license = {file = "COPYING"} dependencies = [ "click", "numpy", + "pandas", "scipy" ] diff --git a/tests/test_serialization.py b/tests/test_serialization.py index b234f9d..15de278 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -19,26 +19,50 @@ import tempfile from hypothesis import given, strategies as st -from hypothesis.extra.numpy import arrays, array_shapes -from pytest import approx +from hypothesis.extra.pandas import column, columns, data_frames +import pandas as pd from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype +from pyhegp.utils import negate -@given(st.integers(), - arrays("float64", - st.shared(array_shapes(max_dims=1), key="number-of-snps"), - elements=st.floats()), - arrays("float64", - st.shared(array_shapes(max_dims=1), key="number-of-snps"), - elements=st.floats())) -def test_read_write_summary_are_inverses(n, mean, std): +tabless_printable_ascii_text = st.text( + # Exclude control characters and tab. + st.characters(codec="ascii", + exclude_categories=("Cc",), + exclude_characters=("\t",)), + min_size=1) +chromosome_column = column(name="chromosome", + dtype="str", + elements=tabless_printable_ascii_text) +position_column = column(name="position", + dtype="int") +reference_column = column(name="reference", + dtype="str", + elements=st.text( + st.characters(codec="ascii", + categories=(), + include_characters=("A", "G", "C", "T")), + min_size=1)) + +@st.composite +def summaries(draw): + return Summary(draw(st.integers()), + draw(data_frames( + columns=([chromosome_column, position_column] + + ([reference_column] if draw(st.booleans()) else []) + + columns(["mean", "std"], + dtype="float64", + elements=st.floats(allow_nan=False)))))) + +@given(summaries()) +def test_read_write_summary_are_inverses(summary): with tempfile.TemporaryFile() as file: - write_summary(file, Summary(n, mean, std)) + write_summary(file, summary) file.seek(0) - summary = read_summary(file) - assert ((summary.n == n) and - (summary.mean == approx(mean, nan_ok=True)) and - (summary.std == approx(std, nan_ok=True))) + recovered_summary = read_summary(file) + pd.testing.assert_frame_equal(summary.data, + recovered_summary.data) + assert summary.n == recovered_summary.n @st.composite def properties_and_whitespace(draw): @@ -69,11 +93,25 @@ def test_read_summary_headers_variable_whitespace(properties_and_whitespace): file.seek(0) assert properties == read_summary_headers(file) -@given(arrays("float64", - array_shapes(min_dims=2, max_dims=2), - elements=st.floats(min_value=0, max_value=100))) +def genotype_reserved_column_name_p(name): + return name.lower() in {"chromosome", "position", "reference"} + +sample_names = st.lists(tabless_printable_ascii_text + .filter(negate(genotype_reserved_column_name_p)), + unique=True) + +@st.composite +def genotype_frames(draw): + return draw(data_frames( + columns=([chromosome_column, position_column] + + ([reference_column] if draw(st.booleans()) else []) + + columns(draw(sample_names), + dtype="float64", + elements=st.floats(allow_nan=False))))) + +@given(genotype_frames()) def test_read_write_genotype_are_inverses(genotype): with tempfile.TemporaryFile() as file: write_genotype(file, genotype) file.seek(0) - assert genotype == approx(read_genotype(file)) + pd.testing.assert_frame_equal(genotype, read_genotype(file)) |