diff options
-rw-r--r-- | .guix-channel | 19 | ||||
-rw-r--r-- | .guix/hsmice-test.scm | 148 | ||||
-rw-r--r-- | e2e-tests/hsmice/check-qtl.py | 27 | ||||
-rw-r--r-- | e2e-tests/hsmice/gwas.r | 72 | ||||
-rw-r--r-- | e2e-tests/hsmice/wrangle.r | 80 |
5 files changed, 345 insertions, 1 deletions
diff --git a/.guix-channel b/.guix-channel index 35e181f..f76e5dd 100644 --- a/.guix-channel +++ b/.guix-channel @@ -1,3 +1,20 @@ (channel (version 0) - (directory ".guix")) + (directory ".guix") + (dependencies + (channel + (name guix-bioinformatics) + (url "https://git.genenetwork.org/guix-bioinformatics")) + ;; Unfortunately, Guix suffers from an open bug whereby channel + ;; dependencies are not resolved recursively. Hence, we must depend + ;; on the guix-past channel explicitly. See + ;; https://issues.guix.gnu.org/68797 for details. + (channel + (name guix-past) + (url "https://codeberg.org/guix-science/guix-past") + (introduction + (channel-introduction + (version 0) + (commit "0c119db2ea86a389769f4d2b9c6f5c41c027e336") + (signer + "3CE4 6455 8A84 FDC6 9DB4 0CFB 090B 1199 3D9A EBB5")))))) diff --git a/.guix/hsmice-test.scm b/.guix/hsmice-test.scm new file mode 100644 index 0000000..ee8ecbd --- /dev/null +++ b/.guix/hsmice-test.scm @@ -0,0 +1,148 @@ +;;; pyhegp --- Homomorphic encryption of genotypes and phenotypes +;;; Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net> +;;; +;;; This file is part of pyhegp. +;;; +;;; pyhegp is free software: you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation, either version 3 of the License, or +;;; (at your option) any later version. +;;; +;;; pyhegp is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;; General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with pyhegp. If not, see <https://www.gnu.org/licenses/>. + +(define-module (hsmice-test) + #:use-module ((gn packages bioinformatics) #:select (r-genio)) + #:use-module ((gnu packages base) #:select (tar)) + #:use-module ((gnu packages compression) #:select (gzip)) + #:use-module ((gnu packages cran) #:select (r-dplyr r-purrr r-qqman r-stringr)) + #:use-module ((gnu packages python) #:select (python)) + #:use-module ((gnu packages python-science) #:select (python-pandas)) + #:use-module ((gnu packages python-xyz) #:select (python-click)) + #:use-module ((gnu packages statistics) #:select (r r-readr r-tibble r-tidyr)) + #:use-module (guix build-system r) + #:use-module (guix download) + #:use-module (guix gexp) + #:use-module (guix git-download) + #:use-module ((guix licenses) #:prefix license:) + #:use-module (guix packages) + #:use-module (guix profiles) + #:use-module (guix utils) + #:use-module ((pyhegp-package) #:select (pyhegp))) + +;; See details of dataset at +;; https://rdr.ucl.ac.uk/articles/dataset/HSmice_tar_gz/24114471?file=42304248 +(define hsmice-data + (origin + (method url-fetch) + (uri "https://ndownloader.figshare.com/files/42304248") + (file-name "HSmice.tar.gz") + (sha256 + (base32 + "1s6a83r0mll8z2lfv1b94zr2sjdrky5nyq1mpgl8fjjb5s8v2vyx")))) + +(define-public r-mixed-model-gwas + (package + (name "r-mixed-model-gwas") + (version "1.3") + (source (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/encryption4genetics/mixed-model-gwas") + (commit (string-append "v" version)))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "0yv86mw9m981vzl80j100lg05kc6jm5ijhq9b8zcd8f2lr3115db")))) + (build-system r-build-system) + (home-page "https://github.com/encryption4genetics/mixed-model-gwas") + (synopsis "R mixed model GWAS") + (description "@code{r-mixed-model-gwas} implements a mixed model @acronym{GWAS, +genome-wide association study} library for R.") + (license license:gpl3+))) + +(define test-profile + (profile + (content (packages->manifest (list gzip tar pyhegp + python python-click python-pandas + r r-dplyr r-genio + r-mixed-model-gwas r-purrr + r-qqman r-readr r-stringr + r-tibble r-tidyr))))) + +(define wrangle-script + (local-file "../e2e-tests/hsmice/wrangle.r")) + +(define gwas-script + (local-file "../e2e-tests/hsmice/gwas.r")) + +(define check-qtl-script + (local-file "../e2e-tests/hsmice/check-qtl.py")) + +(define hsmice-test-gexp + (with-imported-modules '((guix build utils)) + #~(begin + (use-modules (guix build utils)) + + (mkdir #$output) + (set-path-environment-variable + "PATH" '("/bin") '(#$test-profile)) + (set-path-environment-variable + "GUIX_PYTHONPATH" + '(#$(string-append "/lib/python" + (version-major+minor (package-version python)) + "/site-packages")) + '(#$test-profile)) + (set-path-environment-variable + "R_LIBS_SITE" '("/site-library") '(#$test-profile)) + (invoke "tar" "-xvf" #$hsmice-data + "./HSmice/1_QTL_data/") + (invoke "Rscript" #$wrangle-script "HSmice/1_QTL_data" ".") + + ;; GWAS on plaintext + (invoke "Rscript" #$gwas-script + "genotype.tsv" "phenotype.tsv" + (string-append #$output "/plaintext-pvalues")) + (copy-file "Rplots.pdf" (string-append #$output "/plaintext-manhattan.pdf")) + + ;; GWAS with simple ciphertext data sharing + (invoke "pyhegp" "encrypt" "genotype.tsv" "phenotype.tsv") + (invoke "Rscript" #$gwas-script + "genotype.tsv.hegp" "phenotype.tsv.hegp" + (string-append #$output "/ciphertext-pvalues")) + (copy-file "Rplots.pdf" + (string-append #$output "/ciphertext-manhattan.pdf")) + + ;; Joint federated GWAS + (invoke "pyhegp" "summary" "genotype1.tsv" "-o" "summary1") + (invoke "pyhegp" "summary" "genotype2.tsv" "-o" "summary2") + (invoke "pyhegp" "pool" "-o" "complete-summary" "summary1" "summary2") + (invoke "pyhegp" "encrypt" "-s" "complete-summary" "genotype1.tsv" "phenotype1.tsv") + (invoke "pyhegp" "encrypt" "-s" "complete-summary" "genotype2.tsv" "phenotype2.tsv") + (invoke "pyhegp" "cat-genotype" "-o" "complete-genotype.tsv.hegp" + "genotype1.tsv.hegp" "genotype2.tsv.hegp") + (invoke "pyhegp" "cat-phenotype" "-o" "complete-phenotype.tsv.hegp" + "phenotype1.tsv.hegp" "phenotype2.tsv.hegp") + (invoke "Rscript" #$gwas-script + "complete-genotype.tsv.hegp" "complete-phenotype.tsv.hegp" + (string-append #$output "/federated-ciphertext-pvalues")) + (copy-file "Rplots.pdf" + (string-append #$output "/federated-ciphertext-manhattan.pdf")) + + ;; Check that the QTL is where it should be. + (for-each (lambda (pvalues-file) + (invoke "python3" #$check-qtl-script + (string-append #$output "/" pvalues-file))) + (list "plaintext-pvalues" + "ciphertext-pvalues" + "federated-ciphertext-pvalues"))))) + +(define-public hsmice-test + (computed-file "hsmice-test" hsmice-test-gexp)) + +hsmice-test diff --git a/e2e-tests/hsmice/check-qtl.py b/e2e-tests/hsmice/check-qtl.py new file mode 100644 index 0000000..feae361 --- /dev/null +++ b/e2e-tests/hsmice/check-qtl.py @@ -0,0 +1,27 @@ +### pyhegp --- Homomorphic encryption of genotypes and phenotypes +### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net> +### +### This file is part of pyhegp. +### +### pyhegp is free software: you can redistribute it and/or modify it +### under the terms of the GNU General Public License as published by +### the Free Software Foundation, either version 3 of the License, or +### (at your option) any later version. +### +### pyhegp is distributed in the hope that it will be useful, but +### WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +### General Public License for more details. +### +### You should have received a copy of the GNU General Public License +### along with pyhegp. If not, see <https://www.gnu.org/licenses/>. + +import sys + +import pandas as pd + +if __name__ == "__main__": + df = pd.read_csv(sys.argv[1], sep="\t") + qtl = df.query("p < 1e-10") + assert (qtl.chromosome == 4).all() + assert ((qtl.position - 137715608).abs() < 2*10**6).all() diff --git a/e2e-tests/hsmice/gwas.r b/e2e-tests/hsmice/gwas.r new file mode 100644 index 0000000..b342748 --- /dev/null +++ b/e2e-tests/hsmice/gwas.r @@ -0,0 +1,72 @@ +### pyhegp --- Homomorphic encryption of genotypes and phenotypes +### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net> +### +### This file is part of pyhegp. +### +### pyhegp is free software: you can redistribute it and/or modify it +### under the terms of the GNU General Public License as published by +### the Free Software Foundation, either version 3 of the License, or +### (at your option) any later version. +### +### pyhegp is distributed in the hope that it will be useful, but +### WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +### General Public License for more details. +### +### You should have received a copy of the GNU General Public License +### along with pyhegp. If not, see <https://www.gnu.org/licenses/>. + +library(mixed.model.gwas) +library(dplyr) +library(qqman) +library(readr) +library(stringr) +library(tibble) +library(tidyr) + +mmgwas <- function(genotype, phenotype, phenotype_name) { + ## Wrangle phenotype to vector. + phenotype_vector = phenotype %>% pull(phenotype_name) + sample_ids = phenotype %>% pull("sample-id") + names(phenotype_vector) = sample_ids + + ## Wrangle genotype to matrix. + genotype_matrix = genotype %>% + select(all_of(sample_ids)) %>% + as.matrix %>% + t + colnames(genotype_matrix) = genotype %>% + transmute(snp.id=str_c(chromosome, ":", position)) %>% + pull(snp.id) + + ## Compute GWAS. + gwas = mixed.model.gwas(phenotype_vector, genotype_matrix) + return(tibble(chromosome=as.integer(str_split_i(colnames(gwas$pval), ":", 1)), + position=as.integer(str_split_i(colnames(gwas$pval), ":", 2)), + snp.id=colnames(gwas$pval), + p=gwas$pval[1,]) %>% + drop_na()) +} + +run_gwas_experiment <- function(genotype_file, phenotype_file, pvalues_file) { + gwas = mmgwas(read_tsv(genotype_file), + read_tsv(phenotype_file) %>% + select("sample-id", "Biochem.ALP.resid"), + "Biochem.ALP.resid") + write_tsv(gwas %>% select(chromosome, position, p), + pvalues_file) + manhattan(gwas, chr="chromosome", bp="position", snp="snp.id", p="p") +} + +## Process command-line arguments. +args = commandArgs(trailingOnly=TRUE) +if (length(args) != 3) { + write("Usage: Rscript hsmice.r GENOTYPE-FILE PHENOTYPE-FILE PVALUES-FILE", stderr()) + quit(status=1) +} +genotype_file = args[1] +phenotype_file = args[2] +pvalues_file = args[3] + +## Run GWAS. +run_gwas_experiment(genotype_file, phenotype_file, pvalues_file) diff --git a/e2e-tests/hsmice/wrangle.r b/e2e-tests/hsmice/wrangle.r new file mode 100644 index 0000000..f47cf23 --- /dev/null +++ b/e2e-tests/hsmice/wrangle.r @@ -0,0 +1,80 @@ +### pyhegp --- Homomorphic encryption of genotypes and phenotypes +### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net> +### +### This file is part of pyhegp. +### +### pyhegp is free software: you can redistribute it and/or modify it +### under the terms of the GNU General Public License as published by +### the Free Software Foundation, either version 3 of the License, or +### (at your option) any later version. +### +### pyhegp is distributed in the hope that it will be useful, but +### WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +### General Public License for more details. +### +### You should have received a copy of the GNU General Public License +### along with pyhegp. If not, see <https://www.gnu.org/licenses/>. + +library(dplyr) +library(genio) +library(purrr) +library(readr) +library(tibble) +library(tidyr) + +mean_sans_rm = partial(mean, na.rm=TRUE) + +## Process command-line arguments. +args = commandArgs(trailingOnly=TRUE) +if (length(args) != 2) { + write("Usage: Rscript hsmice.r INPUT_DIRECTORY OUTPUT_DIRECTORY", stderr()) + quit(status=1) +} +input_data_directory = args[1] +output_directory = args[2] + +## Read phenotype data. +phenotype = read.table(file.path(input_data_directory, "HSmice.phe"), + header=TRUE) %>% + select(id, sex, Anx.resid, BurrowedPelletWeight.resid, Context.resid, + End.Weight.resid, Explore.resid, FN.preWeight.resid, Freeze.resid, + Glucose.Weight.resid, OFT.Boli.resid, OFT.CenterTime.resid, + OFT.Latency.resid, OFT.TotalActivity.resid, Start.Weight.resid, + Weight.GrowthSlope.resid, Biochem.ALP.resid) %>% + drop_na() %>% + rename("sample-id"="id") +sample_ids = phenotype %>% pull("sample-id") + +## Read genotype data, replace NAs by mean, and subset. +p = read_plink(file.path(input_data_directory, "HSmice.bed")) +genotype_matrix = t(apply(p$X, 1, function(x) ifelse(is.na(x), mean_sans_rm(x), x))) %>% + data.frame() %>% + select(all_of(sample_ids)) +genotype = bind_cols(data.frame(chromosome=p$bim$chr, + position=p$bim$pos, + reference=p$bim$ref), + genotype_matrix) + +## Write whole phenotype and the pieces. +midpoint = length(sample_ids) %/% 2 +sample_ids1 = sample_ids[1:midpoint] +sample_ids2 = sample_ids[(midpoint+1):length(sample_ids)] +write_tsv(phenotype, + file.path(output_directory, "phenotype.tsv")) +write_tsv(phenotype %>% + inner_join(tibble("sample-id"=sample_ids1)), + file.path(output_directory, "phenotype1.tsv")) +write_tsv(phenotype %>% + inner_join(tibble("sample-id"=sample_ids2)), + file.path(output_directory, "phenotype2.tsv")) + +## Write whole genotype and the pieces. +write_tsv(genotype, + file.path(output_directory, "genotype.tsv")) +write_tsv(genotype %>% + select(chromosome, position, reference, all_of(sample_ids1)), + file.path(output_directory, "genotype1.tsv")) +write_tsv(genotype %>% + select(chromosome, position, reference, all_of(sample_ids2)), + file.path(output_directory, "genotype2.tsv")) |