about summary refs log tree commit diff
diff options
context:
space:
mode:
authorArun Isaac2026-01-28 00:23:56 +0000
committerArun Isaac2026-01-28 01:18:09 +0000
commitee0b1836f33004c62d05408c4ee2b69086652131 (patch)
treed284558a4049c8e6b9f910f5c5b046c5e2b3a248
parent963e662a93291b2e5297f637e8a59947657d4325 (diff)
downloadpyhegp-ee0b1836f33004c62d05408c4ee2b69086652131.tar.gz
pyhegp-ee0b1836f33004c62d05408c4ee2b69086652131.tar.lz
pyhegp-ee0b1836f33004c62d05408c4ee2b69086652131.zip
Do not drop zero standard deviation SNPs with --only-center.
-rw-r--r--pyhegp/pyhegp.py9
-rw-r--r--test-data/genotype-with-zero-stddev-snp.tsv4
-rw-r--r--tests/test_pyhegp.py13
3 files changed, 22 insertions, 4 deletions
diff --git a/pyhegp/pyhegp.py b/pyhegp/pyhegp.py
index 48d0ea2..c9a2240 100644
--- a/pyhegp/pyhegp.py
+++ b/pyhegp/pyhegp.py
@@ -262,10 +262,11 @@ def encrypt_command(genotype_file, phenotype_file, summary_file,
     if key_output_file:
         write_key(key_output_file, key)
 
-    # Drop SNPs that have a zero standard deviation. Such SNPs have no
-    # discriminatory power in the analysis and mess with our
-    # standardization by causing a division by zero.
-    summary_subset = drop_zero_stddev_snps(summary)
+    # Drop SNPs that have a zero standard deviation. Such SNPs
+    # have no discriminatory power in the analysis and mess with
+    # our standardization by causing a division by zero. This is
+    # not a problem if we are only centering.
+    summary_subset = summary if only_center else drop_zero_stddev_snps(summary)
     if (dropped_zero_stddev_snps := len(summary.data) - len(summary_subset.data)) > 0:
         print(f"Dropped {dropped_zero_stddev_snps} SNP(s) with zero standard deviation")
 
diff --git a/test-data/genotype-with-zero-stddev-snp.tsv b/test-data/genotype-with-zero-stddev-snp.tsv
new file mode 100644
index 0000000..a78977c
--- /dev/null
+++ b/test-data/genotype-with-zero-stddev-snp.tsv
@@ -0,0 +1,4 @@
+chromosome	position	reference	sample0	sample1	sample2
+chr	0	A	2	0	2
+chr	1	A	1	0	1
+chr	2	A	1	1	1
diff --git a/tests/test_pyhegp.py b/tests/test_pyhegp.py
index 569ebe5..3192bf7 100644
--- a/tests/test_pyhegp.py
+++ b/tests/test_pyhegp.py
@@ -178,6 +178,19 @@ def test_encrypt_genotype_does_not_produce_na(genotype, key, only_center):
 def test_encrypt_phenotype_does_not_produce_na(phenotype, key):
     assert not encrypt_phenotype(phenotype, key).isna().any(axis=None)
 
+def test_encrypt_with_only_center_does_not_drop_snps(tmp_path):
+    genotype_file = Path("test-data/genotype-with-zero-stddev-snp.tsv")
+    shutil.copy(genotype_file, tmp_path)
+    result = CliRunner().invoke(main, ["encrypt",
+                                       "--only-center",
+                                       str(tmp_path / genotype_file.name)])
+    assert result.exit_code == 0
+    with genotype_file.open("rb") as file:
+        genotype = read_genotype(file)
+    with (tmp_path / f"{genotype_file.name}.hegp").open("rb") as file:
+        encrypted_genotype = read_genotype(file)
+    assert len(genotype) == len(encrypted_genotype)
+
 @pytest.mark.parametrize("summary_files",
                          [[Path("test-data/pool-test-summary1"),
                            Path("test-data/pool-test-summary2")],