about summary refs log tree commit diff
diff options
context:
space:
mode:
authorArun Isaac2025-08-04 12:52:39 +0100
committerArun Isaac2025-08-06 22:40:41 +0100
commitbcdb235949c06db07172b0c6355a0059436b86fb (patch)
treeef2a4ab4ea6d3da60a894b50eb7a2470021852f1
parent2dc2efa7f77deb5ebcf7b80abefc162474614b2c (diff)
downloadpyhegp-bcdb235949c06db07172b0c6355a0059436b86fb.tar.gz
pyhegp-bcdb235949c06db07172b0c6355a0059436b86fb.tar.lz
pyhegp-bcdb235949c06db07172b0c6355a0059436b86fb.zip
Standardize file formats in the likeness of plink files.
* pyhegp/pyhegp.py: Import pandas.
(summary, pool, encrypt, cat): Use pandas data frames and new data
format.
* pyhegp/serialization.py: Import csv and pandas.
(Summary)[mean, std]: Delete fields.
[data]: New field.
(read_summary, write_summary, read_genotype, write_genotype): Use
pandas data frames and new data format.
* tests/test_serialization.py: Import column, columns and data_frames
from hypothesis.extra.pandas; pandas; negate from pyhegp.utils. Do not
import hypothesis.extra.numpy and approx from pytest.
(tabless_printable_ascii_text, chromosome_column, position_column,
reference_column, sample_names): New variables.
(summaries, genotype_reserved_column_name_p, genotype_frames): New
functions.
(test_read_write_summary_are_inverses): Use pandas data frames and new
data format.
(test_read_write_genotype_are_inverses): Use pandas for testing.
* doc/file-formats.md (File formats)[summary file]: Describe new
standard.
[genotype file]: New section.
* .guix/pyhegp-package.scm (pyhegp-package): Import python-pandas
from (gnu packages python-science).
(python-pyhegp)[propagated-inputs]: Add python-pandas.
* pyproject.toml (dependencies): Add pandas.
-rw-r--r--.guix/pyhegp-package.scm3
-rw-r--r--doc/file-formats.md13
-rw-r--r--pyhegp/pyhegp.py40
-rw-r--r--pyhegp/serialization.py47
-rw-r--r--pyproject.toml1
-rw-r--r--tests/test_serialization.py76
6 files changed, 138 insertions, 42 deletions
diff --git a/.guix/pyhegp-package.scm b/.guix/pyhegp-package.scm
index cf132c5..6778e38 100644
--- a/.guix/pyhegp-package.scm
+++ b/.guix/pyhegp-package.scm
@@ -20,7 +20,7 @@
   #:use-module ((gnu packages check) #:select (python-hypothesis-next))
   #:use-module ((gnu packages check) #:select (python-pytest) #:prefix guix:)
   #:use-module ((gnu packages python-build) #:select (python-flit-core))
-  #:use-module ((gnu packages python-science) #:select (python-scipy))
+  #:use-module ((gnu packages python-science) #:select (python-pandas python-scipy))
   #:use-module ((gnu packages python-xyz) #:select (python-click python-numpy))
   #:use-module (guix build-system pyproject)
   #:use-module (guix gexp)
@@ -53,6 +53,7 @@
     (propagated-inputs
      (list python-click
            python-numpy
+           python-pandas
            python-scipy))
     (home-page "https://github.com/encryption4genetics/pyhegp")
     (synopsis "Homomorphic encryption of genotypes and phenotypes")
diff --git a/doc/file-formats.md b/doc/file-formats.md
index 4d3bfcd..be8162f 100644
--- a/doc/file-formats.md
+++ b/doc/file-formats.md
@@ -5,7 +5,18 @@ The summary file is ASCII encoded. It consists of two sections—the header and
 
 The first line of the header section MUST be `# pyhegp summary file version 1`. Subsequent lines of the header section are a list of key-value pairs. Each line MUST be `#`, optional whitespace, the key, a single space character and then the value. The key MUST NOT contain whitespace or control characters, and MUST NOT begin with a `#` character. The value MAY contain whitespace characters, but MUST NOT contain control characters.
 
-The data section is a tab-separated table of numbers. The first line of the data section is a vector of means—one for each SNP. The second line is a vector of standard deviations—one for each SNP.
+The data section is a tab-separated table of numbers. The first line MUST be a header with column labels. Each row corresponds to one SNP. The columns labelled `chromosome`, `position`, `reference`, `mean` and `standard-deviation` contain the chromosome, the position of the SNP on the chromosome, the reference allele, the mean dosage and the standard deviation of the dosage for that SNP. Column labels are case-sensitive.
+
+The `reference` column is optional, and SHOULD be absent in pooled summary files.
 
 Here is an example summary file.
 `TODO: Add example.`
+
+## genotype file
+
+The genotype file is a tab-separated values (TSV) file. The first line MUST be a header with column labels. Each row corresponds to one SNP. The columns labelled `chromosome`, `position` and `reference` contain the chromosome, the position on the chromosome and the reference allele for that SNP. Other columns each contain dosage values for one sample. The headers of these columns MUST be their sample identifiers. Column headers are case-sensitive.
+
+the `reference` column is optional, and should be absent in encrypted genotype files.
+
+Here is an example genotype file.
+`TODO: Add example.`
diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py
index 8de6f9d..ddc796f 100644
--- a/pyhegp/pyhegp.py
+++ b/pyhegp/pyhegp.py
@@ -20,6 +20,7 @@ from collections import namedtuple
 
 import click
 import numpy as np
+import pandas as pd
 from scipy.stats import special_ortho_group
 
 from pyhegp.serialization import Summary, read_summary, write_summary, read_genotype, write_genotype
@@ -67,10 +68,14 @@ def main():
               help="output file")
 def summary(genotype_file, summary_file):
     genotype = read_genotype(genotype_file)
+    matrix = genotype.drop(columns=["chromosome", "position", "reference"]).to_numpy()
     write_summary(summary_file,
                   Summary(genotype.shape[0],
-                          np.mean(genotype, axis=0),
-                          np.std(genotype, axis=0)))
+                          pd.DataFrame({"chromosome": genotype.chromosome,
+                                        "position": genotype.position,
+                                        "reference": genotype.reference,
+                                        "mean": np.mean(matrix, axis=1),
+                                        "std": np.std(matrix, axis=1)})))
 
 @main.command()
 @click.option("--output", "-o", "pooled_summary_file",
@@ -80,12 +85,16 @@ def summary(genotype_file, summary_file):
 @click.argument("summary-files", type=click.File("rb"), nargs=-1)
 def pool(pooled_summary_file, summary_files):
     summaries = [read_summary(file) for file in summary_files]
-    pooled_stats = pool_stats([Stats(summary.n, summary.mean, summary.std)
+    pooled_stats = pool_stats([Stats(summary.n,
+                                     summary.data["mean"].to_numpy(),
+                                     summary.data["std"].to_numpy())
                                for summary in summaries])
     write_summary(pooled_summary_file,
                   Summary(pooled_stats.n,
-                          pooled_stats.mean,
-                          pooled_stats.std))
+                          pd.concat((summaries[0].data[["chromosome", "position"]],
+                                     pd.DataFrame({"mean": pooled_stats.mean,
+                                                   "std": pooled_stats.std})),
+                                    axis="columns")))
 
 @main.command()
 @click.argument("genotype-file", type=click.File("r"))
@@ -99,16 +108,23 @@ def pool(pooled_summary_file, summary_files):
               help="Output ciphertext")
 def encrypt(genotype_file, summary_file, key_file, ciphertext_file):
     genotype = read_genotype(genotype_file)
+    sample_names = genotype.drop(columns=["chromosome", "position", "reference"]).columns
+    genotype_matrix = genotype[sample_names].to_numpy().T
     summary = read_summary(summary_file)
     rng = np.random.default_rng()
-    key = random_key(rng, len(genotype))
-    encrypted_genotype = hegp_encrypt(standardize(genotype,
-                                                  summary.mean,
-                                                  summary.std),
-                                      key)
+    key = random_key(rng, len(genotype_matrix))
+    encrypted_genotype_matrix = hegp_encrypt(standardize(
+        genotype_matrix,
+        summary.data["mean"].to_numpy(),
+        summary.data["std"].to_numpy()),
+                                             key)
     if key_file:
         np.savetxt(key_file, key, delimiter=",", fmt="%f")
-    write_genotype(ciphertext_file, encrypted_genotype)
+    write_genotype(ciphertext_file,
+                   pd.concat((genotype[["chromosome", "position"]],
+                              pd.DataFrame(encrypted_genotype_matrix.T,
+                                           columns=sample_names)),
+                             axis="columns"))
 
 @main.command()
 @click.option("--output", "-o", "output_file",
@@ -118,7 +134,7 @@ def encrypt(genotype_file, summary_file, key_file, ciphertext_file):
 @click.argument("ciphertext-files", type=click.File("rb"), nargs=-1)
 def cat(output_file, ciphertext_files):
     write_genotype(output_file,
-                   np.vstack([read_genotype(file) for file in ciphertext_files]))
+                   pd.concat([read_genotype(file) for file in ciphertext_files]))
 
 if __name__ == "__main__":
     main()
diff --git a/pyhegp/serialization.py b/pyhegp/serialization.py
index b0be568..77fa4a5 100644
--- a/pyhegp/serialization.py
+++ b/pyhegp/serialization.py
@@ -17,13 +17,14 @@
 ### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
 
 from collections import namedtuple
+import csv
 from itertools import takewhile
 
-import numpy as np
+import pandas as pd
 
 SUMMARY_HEADER = b"# pyhegp summary file version 1\n"
 
-Summary = namedtuple("Summary", "n mean std")
+Summary = namedtuple("Summary", "n data")
 
 def peek(file):
     c = file.read(1)
@@ -43,18 +44,46 @@ def read_summary_headers(file):
 def read_summary(file):
     headers = read_summary_headers(file)
     return Summary(int(headers["number-of-samples"]),
-                   *np.loadtxt(file, ndmin=2, delimiter="\t"))
+                   pd.read_csv(file,
+                               sep="\t",
+                               header=0,
+                               dtype={"chromosome": "str",
+                                      "position": "int",
+                                      "reference": "str",
+                                      "mean": "float",
+                                      "standard-deviation": "float"},
+                               na_filter=False)
+                   .rename(columns={"standard-deviation": "std"}))
 
 def write_summary(file, summary):
     file.write(SUMMARY_HEADER)
     file.write(f"# number-of-samples {summary.n}\n".encode("ascii"))
-    np.savetxt(file,
-               np.row_stack((summary.mean, summary.std)),
-               delimiter="\t",
-               fmt="%.8g")
+    (summary.data
+     .rename(columns={"std": "standard-deviation"})
+     .to_csv(file,
+             sep="\t",
+             float_format="%.8g",
+             index=False))
 
 def read_genotype(file):
-    return np.loadtxt(file, delimiter=",", ndmin=2)
+    df = pd.read_csv(file,
+                     quoting=csv.QUOTE_NONE,
+                     sep="\t",
+                     na_filter=False)
+    sample_columns = [column
+                      for column in df.columns
+                      if column not in ["chromosome", "position", "reference"]]
+    df.chromosome = df.chromosome.astype("str")
+    df.position = df.position.astype("int")
+    if "reference" in df:
+        df.reference = df.reference.astype("str")
+    df[sample_columns] = df[sample_columns].astype("float")
+    return df
 
 def write_genotype(file, genotype):
-    np.savetxt(file, genotype, delimiter=",", fmt="%.8g")
+    (genotype
+     .to_csv(file,
+             quoting=csv.QUOTE_NONE,
+             sep="\t",
+             float_format="%.8g",
+             index=False))
diff --git a/pyproject.toml b/pyproject.toml
index c0a6ab2..636a56d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ license = {file = "COPYING"}
 dependencies = [
   "click",
   "numpy",
+  "pandas",
   "scipy"
 ]
 
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index b234f9d..15de278 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -19,26 +19,50 @@
 import tempfile
 
 from hypothesis import given, strategies as st
-from hypothesis.extra.numpy import arrays, array_shapes
-from pytest import approx
+from hypothesis.extra.pandas import column, columns, data_frames
+import pandas as pd
 
 from pyhegp.serialization import Summary, read_summary, write_summary, read_summary_headers, read_genotype, write_genotype
+from pyhegp.utils import negate
 
-@given(st.integers(),
-       arrays("float64",
-              st.shared(array_shapes(max_dims=1), key="number-of-snps"),
-              elements=st.floats()),
-       arrays("float64",
-              st.shared(array_shapes(max_dims=1), key="number-of-snps"),
-              elements=st.floats()))
-def test_read_write_summary_are_inverses(n, mean, std):
+tabless_printable_ascii_text = st.text(
+    # Exclude control characters and tab.
+    st.characters(codec="ascii",
+                  exclude_categories=("Cc",),
+                  exclude_characters=("\t",)),
+    min_size=1)
+chromosome_column = column(name="chromosome",
+                           dtype="str",
+                           elements=tabless_printable_ascii_text)
+position_column = column(name="position",
+                         dtype="int")
+reference_column = column(name="reference",
+                          dtype="str",
+                          elements=st.text(
+                              st.characters(codec="ascii",
+                                            categories=(),
+                                            include_characters=("A", "G", "C", "T")),
+                              min_size=1))
+
+@st.composite
+def summaries(draw):
+    return Summary(draw(st.integers()),
+                   draw(data_frames(
+                       columns=([chromosome_column, position_column]
+                                + ([reference_column] if draw(st.booleans()) else [])
+                                + columns(["mean", "std"],
+                                          dtype="float64",
+                                          elements=st.floats(allow_nan=False))))))
+
+@given(summaries())
+def test_read_write_summary_are_inverses(summary):
     with tempfile.TemporaryFile() as file:
-        write_summary(file, Summary(n, mean, std))
+        write_summary(file, summary)
         file.seek(0)
-        summary = read_summary(file)
-        assert ((summary.n == n) and
-                (summary.mean == approx(mean, nan_ok=True)) and
-                (summary.std == approx(std, nan_ok=True)))
+        recovered_summary = read_summary(file)
+        pd.testing.assert_frame_equal(summary.data,
+                                      recovered_summary.data)
+        assert summary.n == recovered_summary.n
 
 @st.composite
 def properties_and_whitespace(draw):
@@ -69,11 +93,25 @@ def test_read_summary_headers_variable_whitespace(properties_and_whitespace):
         file.seek(0)
         assert properties == read_summary_headers(file)
 
-@given(arrays("float64",
-              array_shapes(min_dims=2, max_dims=2),
-              elements=st.floats(min_value=0, max_value=100)))
+def genotype_reserved_column_name_p(name):
+    return name.lower() in {"chromosome", "position", "reference"}
+
+sample_names = st.lists(tabless_printable_ascii_text
+                        .filter(negate(genotype_reserved_column_name_p)),
+                        unique=True)
+
+@st.composite
+def genotype_frames(draw):
+    return draw(data_frames(
+        columns=([chromosome_column, position_column]
+                 + ([reference_column] if draw(st.booleans()) else [])
+                 + columns(draw(sample_names),
+                           dtype="float64",
+                           elements=st.floats(allow_nan=False)))))
+
+@given(genotype_frames())
 def test_read_write_genotype_are_inverses(genotype):
     with tempfile.TemporaryFile() as file:
         write_genotype(file, genotype)
         file.seek(0)
-        assert genotype == approx(read_genotype(file))
+        pd.testing.assert_frame_equal(genotype, read_genotype(file))