about summary refs log tree commit diff
path: root/e2e-tests/hsmice
diff options
context:
space:
mode:
authorArun Isaac2025-09-06 18:16:28 +0100
committerArun Isaac2025-09-06 18:49:49 +0100
commitb89ecd37fe19fdb31beebabd2796b74f5dc97743 (patch)
treea5cd342fcf4cae262a2483f099198908260f44e7 /e2e-tests/hsmice
parent9905a9ff160ae4688815640cc4e669c5fdfe3273 (diff)
downloadpyhegp-b89ecd37fe19fdb31beebabd2796b74f5dc97743.tar.gz
pyhegp-b89ecd37fe19fdb31beebabd2796b74f5dc97743.tar.lz
pyhegp-b89ecd37fe19fdb31beebabd2796b74f5dc97743.zip
Add end-to-end tests for hsmice dataset.
* Add hsmice dataset wrangling and test scripts.
* Add G-expression script to run test.
* Depend on the guix-bioinformatics Guix channel for r-genio.
Diffstat (limited to 'e2e-tests/hsmice')
-rw-r--r--e2e-tests/hsmice/check-qtl.py27
-rw-r--r--e2e-tests/hsmice/gwas.r72
-rw-r--r--e2e-tests/hsmice/wrangle.r80
3 files changed, 179 insertions, 0 deletions
diff --git a/e2e-tests/hsmice/check-qtl.py b/e2e-tests/hsmice/check-qtl.py
new file mode 100644
index 0000000..feae361
--- /dev/null
+++ b/e2e-tests/hsmice/check-qtl.py
@@ -0,0 +1,27 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+
+import pandas as pd
+
+if __name__ == "__main__":
+    df = pd.read_csv(sys.argv[1], sep="\t")
+    qtl = df.query("p < 1e-10")
+    assert (qtl.chromosome == 4).all()
+    assert ((qtl.position - 137715608).abs() < 2*10**6).all()
diff --git a/e2e-tests/hsmice/gwas.r b/e2e-tests/hsmice/gwas.r
new file mode 100644
index 0000000..b342748
--- /dev/null
+++ b/e2e-tests/hsmice/gwas.r
@@ -0,0 +1,72 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+library(mixed.model.gwas)
+library(dplyr)
+library(qqman)
+library(readr)
+library(stringr)
+library(tibble)
+library(tidyr)
+
+mmgwas <- function(genotype, phenotype, phenotype_name) {
+    ## Wrangle phenotype to vector.
+    phenotype_vector = phenotype %>% pull(phenotype_name)
+    sample_ids = phenotype %>% pull("sample-id")
+    names(phenotype_vector) = sample_ids
+
+    ## Wrangle genotype to matrix.
+    genotype_matrix = genotype %>%
+        select(all_of(sample_ids)) %>%
+        as.matrix %>%
+        t
+    colnames(genotype_matrix) = genotype %>%
+        transmute(snp.id=str_c(chromosome, ":", position)) %>%
+        pull(snp.id)
+
+    ## Compute GWAS.
+    gwas = mixed.model.gwas(phenotype_vector, genotype_matrix)
+    return(tibble(chromosome=as.integer(str_split_i(colnames(gwas$pval), ":", 1)),
+                  position=as.integer(str_split_i(colnames(gwas$pval), ":", 2)),
+                  snp.id=colnames(gwas$pval),
+                  p=gwas$pval[1,]) %>%
+           drop_na())
+}
+
+run_gwas_experiment <- function(genotype_file, phenotype_file, pvalues_file) {
+    gwas = mmgwas(read_tsv(genotype_file),
+                  read_tsv(phenotype_file) %>%
+                  select("sample-id", "Biochem.ALP.resid"),
+                  "Biochem.ALP.resid")
+    write_tsv(gwas %>% select(chromosome, position, p),
+              pvalues_file)
+    manhattan(gwas, chr="chromosome", bp="position", snp="snp.id", p="p")
+}
+
+## Process command-line arguments.
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) != 3) {
+    write("Usage: Rscript hsmice.r GENOTYPE-FILE PHENOTYPE-FILE PVALUES-FILE", stderr())
+    quit(status=1)
+}
+genotype_file = args[1]
+phenotype_file = args[2]
+pvalues_file = args[3]
+
+## Run GWAS.
+run_gwas_experiment(genotype_file, phenotype_file, pvalues_file)
diff --git a/e2e-tests/hsmice/wrangle.r b/e2e-tests/hsmice/wrangle.r
new file mode 100644
index 0000000..f47cf23
--- /dev/null
+++ b/e2e-tests/hsmice/wrangle.r
@@ -0,0 +1,80 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+library(dplyr)
+library(genio)
+library(purrr)
+library(readr)
+library(tibble)
+library(tidyr)
+
+mean_sans_rm = partial(mean, na.rm=TRUE)
+
+## Process command-line arguments.
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) != 2) {
+    write("Usage: Rscript hsmice.r INPUT_DIRECTORY OUTPUT_DIRECTORY", stderr())
+    quit(status=1)
+}
+input_data_directory = args[1]
+output_directory = args[2]
+
+## Read phenotype data.
+phenotype = read.table(file.path(input_data_directory, "HSmice.phe"),
+                       header=TRUE) %>%
+    select(id, sex, Anx.resid, BurrowedPelletWeight.resid, Context.resid,
+           End.Weight.resid, Explore.resid, FN.preWeight.resid, Freeze.resid,
+           Glucose.Weight.resid, OFT.Boli.resid, OFT.CenterTime.resid,
+           OFT.Latency.resid, OFT.TotalActivity.resid, Start.Weight.resid,
+           Weight.GrowthSlope.resid, Biochem.ALP.resid) %>%
+    drop_na() %>%
+    rename("sample-id"="id")
+sample_ids = phenotype %>% pull("sample-id")
+
+## Read genotype data, replace NAs by mean, and subset.
+p = read_plink(file.path(input_data_directory, "HSmice.bed"))
+genotype_matrix = t(apply(p$X, 1, function(x) ifelse(is.na(x), mean_sans_rm(x), x))) %>%
+    data.frame() %>%
+    select(all_of(sample_ids))
+genotype = bind_cols(data.frame(chromosome=p$bim$chr,
+                                position=p$bim$pos,
+                                reference=p$bim$ref),
+                     genotype_matrix)
+
+## Write whole phenotype and the pieces.
+midpoint = length(sample_ids) %/% 2
+sample_ids1 = sample_ids[1:midpoint]
+sample_ids2 = sample_ids[(midpoint+1):length(sample_ids)]
+write_tsv(phenotype,
+          file.path(output_directory, "phenotype.tsv"))
+write_tsv(phenotype %>%
+          inner_join(tibble("sample-id"=sample_ids1)),
+          file.path(output_directory, "phenotype1.tsv"))
+write_tsv(phenotype %>%
+          inner_join(tibble("sample-id"=sample_ids2)),
+          file.path(output_directory, "phenotype2.tsv"))
+
+## Write whole genotype and the pieces.
+write_tsv(genotype,
+          file.path(output_directory, "genotype.tsv"))
+write_tsv(genotype %>%
+          select(chromosome, position, reference, all_of(sample_ids1)),
+          file.path(output_directory, "genotype1.tsv"))
+write_tsv(genotype %>%
+          select(chromosome, position, reference, all_of(sample_ids2)),
+          file.path(output_directory, "genotype2.tsv"))