about summary refs log tree commit diff
path: root/e2e-tests/hsmice/wrangle.r
diff options
context:
space:
mode:
Diffstat (limited to 'e2e-tests/hsmice/wrangle.r')
-rw-r--r--e2e-tests/hsmice/wrangle.r80
1 files changed, 80 insertions, 0 deletions
diff --git a/e2e-tests/hsmice/wrangle.r b/e2e-tests/hsmice/wrangle.r
new file mode 100644
index 0000000..f47cf23
--- /dev/null
+++ b/e2e-tests/hsmice/wrangle.r
@@ -0,0 +1,80 @@
+### pyhegp --- Homomorphic encryption of genotypes and phenotypes
+### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
+###
+### This file is part of pyhegp.
+###
+### pyhegp is free software: you can redistribute it and/or modify it
+### under the terms of the GNU General Public License as published by
+### the Free Software Foundation, either version 3 of the License, or
+### (at your option) any later version.
+###
+### pyhegp is distributed in the hope that it will be useful, but
+### WITHOUT ANY WARRANTY; without even the implied warranty of
+### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+### General Public License for more details.
+###
+### You should have received a copy of the GNU General Public License
+### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.
+
+library(dplyr)
+library(genio)
+library(purrr)
+library(readr)
+library(tibble)
+library(tidyr)
+
+mean_sans_rm = partial(mean, na.rm=TRUE)
+
+## Process command-line arguments.
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) != 2) {
+    write("Usage: Rscript hsmice.r INPUT_DIRECTORY OUTPUT_DIRECTORY", stderr())
+    quit(status=1)
+}
+input_data_directory = args[1]
+output_directory = args[2]
+
+## Read phenotype data.
+phenotype = read.table(file.path(input_data_directory, "HSmice.phe"),
+                       header=TRUE) %>%
+    select(id, sex, Anx.resid, BurrowedPelletWeight.resid, Context.resid,
+           End.Weight.resid, Explore.resid, FN.preWeight.resid, Freeze.resid,
+           Glucose.Weight.resid, OFT.Boli.resid, OFT.CenterTime.resid,
+           OFT.Latency.resid, OFT.TotalActivity.resid, Start.Weight.resid,
+           Weight.GrowthSlope.resid, Biochem.ALP.resid) %>%
+    drop_na() %>%
+    rename("sample-id"="id")
+sample_ids = phenotype %>% pull("sample-id")
+
+## Read genotype data, replace NAs by mean, and subset.
+p = read_plink(file.path(input_data_directory, "HSmice.bed"))
+genotype_matrix = t(apply(p$X, 1, function(x) ifelse(is.na(x), mean_sans_rm(x), x))) %>%
+    data.frame() %>%
+    select(all_of(sample_ids))
+genotype = bind_cols(data.frame(chromosome=p$bim$chr,
+                                position=p$bim$pos,
+                                reference=p$bim$ref),
+                     genotype_matrix)
+
+## Write whole phenotype and the pieces.
+midpoint = length(sample_ids) %/% 2
+sample_ids1 = sample_ids[1:midpoint]
+sample_ids2 = sample_ids[(midpoint+1):length(sample_ids)]
+write_tsv(phenotype,
+          file.path(output_directory, "phenotype.tsv"))
+write_tsv(phenotype %>%
+          inner_join(tibble("sample-id"=sample_ids1)),
+          file.path(output_directory, "phenotype1.tsv"))
+write_tsv(phenotype %>%
+          inner_join(tibble("sample-id"=sample_ids2)),
+          file.path(output_directory, "phenotype2.tsv"))
+
+## Write whole genotype and the pieces.
+write_tsv(genotype,
+          file.path(output_directory, "genotype.tsv"))
+write_tsv(genotype %>%
+          select(chromosome, position, reference, all_of(sample_ids1)),
+          file.path(output_directory, "genotype1.tsv"))
+write_tsv(genotype %>%
+          select(chromosome, position, reference, all_of(sample_ids2)),
+          file.path(output_directory, "genotype2.tsv"))