From b89ecd37fe19fdb31beebabd2796b74f5dc97743 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Sat, 6 Sep 2025 18:16:28 +0100 Subject: Add end-to-end tests for hsmice dataset. * Add hsmice dataset wrangling and test scripts. * Add G-expression script to run test. * Depend on the guix-bioinformatics Guix channel for r-genio. --- e2e-tests/hsmice/check-qtl.py | 27 +++++++++++++++ e2e-tests/hsmice/gwas.r | 72 ++++++++++++++++++++++++++++++++++++++ e2e-tests/hsmice/wrangle.r | 80 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+) create mode 100644 e2e-tests/hsmice/check-qtl.py create mode 100644 e2e-tests/hsmice/gwas.r create mode 100644 e2e-tests/hsmice/wrangle.r (limited to 'e2e-tests/hsmice') diff --git a/e2e-tests/hsmice/check-qtl.py b/e2e-tests/hsmice/check-qtl.py new file mode 100644 index 0000000..feae361 --- /dev/null +++ b/e2e-tests/hsmice/check-qtl.py @@ -0,0 +1,27 @@ +### pyhegp --- Homomorphic encryption of genotypes and phenotypes +### Copyright © 2025 Arun Isaac +### +### This file is part of pyhegp. +### +### pyhegp is free software: you can redistribute it and/or modify it +### under the terms of the GNU General Public License as published by +### the Free Software Foundation, either version 3 of the License, or +### (at your option) any later version. +### +### pyhegp is distributed in the hope that it will be useful, but +### WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +### General Public License for more details. +### +### You should have received a copy of the GNU General Public License +### along with pyhegp. If not, see . + +import sys + +import pandas as pd + +if __name__ == "__main__": + df = pd.read_csv(sys.argv[1], sep="\t") + qtl = df.query("p < 1e-10") + assert (qtl.chromosome == 4).all() + assert ((qtl.position - 137715608).abs() < 2*10**6).all() diff --git a/e2e-tests/hsmice/gwas.r b/e2e-tests/hsmice/gwas.r new file mode 100644 index 0000000..b342748 --- /dev/null +++ b/e2e-tests/hsmice/gwas.r @@ -0,0 +1,72 @@ +### pyhegp --- Homomorphic encryption of genotypes and phenotypes +### Copyright © 2025 Arun Isaac +### +### This file is part of pyhegp. +### +### pyhegp is free software: you can redistribute it and/or modify it +### under the terms of the GNU General Public License as published by +### the Free Software Foundation, either version 3 of the License, or +### (at your option) any later version. +### +### pyhegp is distributed in the hope that it will be useful, but +### WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +### General Public License for more details. +### +### You should have received a copy of the GNU General Public License +### along with pyhegp. If not, see . + +library(mixed.model.gwas) +library(dplyr) +library(qqman) +library(readr) +library(stringr) +library(tibble) +library(tidyr) + +mmgwas <- function(genotype, phenotype, phenotype_name) { + ## Wrangle phenotype to vector. + phenotype_vector = phenotype %>% pull(phenotype_name) + sample_ids = phenotype %>% pull("sample-id") + names(phenotype_vector) = sample_ids + + ## Wrangle genotype to matrix. + genotype_matrix = genotype %>% + select(all_of(sample_ids)) %>% + as.matrix %>% + t + colnames(genotype_matrix) = genotype %>% + transmute(snp.id=str_c(chromosome, ":", position)) %>% + pull(snp.id) + + ## Compute GWAS. + gwas = mixed.model.gwas(phenotype_vector, genotype_matrix) + return(tibble(chromosome=as.integer(str_split_i(colnames(gwas$pval), ":", 1)), + position=as.integer(str_split_i(colnames(gwas$pval), ":", 2)), + snp.id=colnames(gwas$pval), + p=gwas$pval[1,]) %>% + drop_na()) +} + +run_gwas_experiment <- function(genotype_file, phenotype_file, pvalues_file) { + gwas = mmgwas(read_tsv(genotype_file), + read_tsv(phenotype_file) %>% + select("sample-id", "Biochem.ALP.resid"), + "Biochem.ALP.resid") + write_tsv(gwas %>% select(chromosome, position, p), + pvalues_file) + manhattan(gwas, chr="chromosome", bp="position", snp="snp.id", p="p") +} + +## Process command-line arguments. +args = commandArgs(trailingOnly=TRUE) +if (length(args) != 3) { + write("Usage: Rscript hsmice.r GENOTYPE-FILE PHENOTYPE-FILE PVALUES-FILE", stderr()) + quit(status=1) +} +genotype_file = args[1] +phenotype_file = args[2] +pvalues_file = args[3] + +## Run GWAS. +run_gwas_experiment(genotype_file, phenotype_file, pvalues_file) diff --git a/e2e-tests/hsmice/wrangle.r b/e2e-tests/hsmice/wrangle.r new file mode 100644 index 0000000..f47cf23 --- /dev/null +++ b/e2e-tests/hsmice/wrangle.r @@ -0,0 +1,80 @@ +### pyhegp --- Homomorphic encryption of genotypes and phenotypes +### Copyright © 2025 Arun Isaac +### +### This file is part of pyhegp. +### +### pyhegp is free software: you can redistribute it and/or modify it +### under the terms of the GNU General Public License as published by +### the Free Software Foundation, either version 3 of the License, or +### (at your option) any later version. +### +### pyhegp is distributed in the hope that it will be useful, but +### WITHOUT ANY WARRANTY; without even the implied warranty of +### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +### General Public License for more details. +### +### You should have received a copy of the GNU General Public License +### along with pyhegp. If not, see . + +library(dplyr) +library(genio) +library(purrr) +library(readr) +library(tibble) +library(tidyr) + +mean_sans_rm = partial(mean, na.rm=TRUE) + +## Process command-line arguments. +args = commandArgs(trailingOnly=TRUE) +if (length(args) != 2) { + write("Usage: Rscript hsmice.r INPUT_DIRECTORY OUTPUT_DIRECTORY", stderr()) + quit(status=1) +} +input_data_directory = args[1] +output_directory = args[2] + +## Read phenotype data. +phenotype = read.table(file.path(input_data_directory, "HSmice.phe"), + header=TRUE) %>% + select(id, sex, Anx.resid, BurrowedPelletWeight.resid, Context.resid, + End.Weight.resid, Explore.resid, FN.preWeight.resid, Freeze.resid, + Glucose.Weight.resid, OFT.Boli.resid, OFT.CenterTime.resid, + OFT.Latency.resid, OFT.TotalActivity.resid, Start.Weight.resid, + Weight.GrowthSlope.resid, Biochem.ALP.resid) %>% + drop_na() %>% + rename("sample-id"="id") +sample_ids = phenotype %>% pull("sample-id") + +## Read genotype data, replace NAs by mean, and subset. +p = read_plink(file.path(input_data_directory, "HSmice.bed")) +genotype_matrix = t(apply(p$X, 1, function(x) ifelse(is.na(x), mean_sans_rm(x), x))) %>% + data.frame() %>% + select(all_of(sample_ids)) +genotype = bind_cols(data.frame(chromosome=p$bim$chr, + position=p$bim$pos, + reference=p$bim$ref), + genotype_matrix) + +## Write whole phenotype and the pieces. +midpoint = length(sample_ids) %/% 2 +sample_ids1 = sample_ids[1:midpoint] +sample_ids2 = sample_ids[(midpoint+1):length(sample_ids)] +write_tsv(phenotype, + file.path(output_directory, "phenotype.tsv")) +write_tsv(phenotype %>% + inner_join(tibble("sample-id"=sample_ids1)), + file.path(output_directory, "phenotype1.tsv")) +write_tsv(phenotype %>% + inner_join(tibble("sample-id"=sample_ids2)), + file.path(output_directory, "phenotype2.tsv")) + +## Write whole genotype and the pieces. +write_tsv(genotype, + file.path(output_directory, "genotype.tsv")) +write_tsv(genotype %>% + select(chromosome, position, reference, all_of(sample_ids1)), + file.path(output_directory, "genotype1.tsv")) +write_tsv(genotype %>% + select(chromosome, position, reference, all_of(sample_ids2)), + file.path(output_directory, "genotype2.tsv")) -- cgit 1.4.1