e2e-tests/hsmice/wrangle.r


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

### pyhegp --- Homomorphic encryption of genotypes and phenotypes
### Copyright © 2025 Arun Isaac <arunisaac@systemreboot.net>
###
### This file is part of pyhegp.
###
### pyhegp is free software: you can redistribute it and/or modify it
### under the terms of the GNU General Public License as published by
### the Free Software Foundation, either version 3 of the License, or
### (at your option) any later version.
###
### pyhegp is distributed in the hope that it will be useful, but
### WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
### General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with pyhegp. If not, see <https://www.gnu.org/licenses/>.

library(dplyr)
library(genio)
library(purrr)
library(readr)
library(tibble)
library(tidyr)

mean_sans_rm = partial(mean, na.rm=TRUE)

## Process command-line arguments.
args = commandArgs(trailingOnly=TRUE)
if (length(args) != 2) {
    write("Usage: Rscript hsmice.r INPUT_DIRECTORY OUTPUT_DIRECTORY", stderr())
    quit(status=1)
}
input_data_directory = args[1]
output_directory = args[2]

## Read phenotype data.
phenotype = read.table(file.path(input_data_directory, "HSmice.phe"),
                       header=TRUE) %>%
    select(id, sex, Anx.resid, BurrowedPelletWeight.resid, Context.resid,
           End.Weight.resid, Explore.resid, FN.preWeight.resid, Freeze.resid,
           Glucose.Weight.resid, OFT.Boli.resid, OFT.CenterTime.resid,
           OFT.Latency.resid, OFT.TotalActivity.resid, Start.Weight.resid,
           Weight.GrowthSlope.resid, Biochem.ALP.resid) %>%
    drop_na() %>%
    rename("sample-id"="id")
sample_ids = phenotype %>% pull("sample-id")

## Read genotype data, replace NAs by mean, and subset.
p = read_plink(file.path(input_data_directory, "HSmice.bed"))
genotype_matrix = t(apply(p$X, 1, function(x) ifelse(is.na(x), mean_sans_rm(x), x))) %>%
    data.frame() %>%
    select(all_of(sample_ids))
genotype = bind_cols(data.frame(chromosome=p$bim$chr,
                                position=p$bim$pos,
                                reference=p$bim$ref),
                     genotype_matrix)

## Write whole phenotype and the pieces.
midpoint = length(sample_ids) %/% 2
sample_ids1 = sample_ids[1:midpoint]
sample_ids2 = sample_ids[(midpoint+1):length(sample_ids)]
write_tsv(phenotype,
          file.path(output_directory, "phenotype.tsv"))
write_tsv(phenotype %>%
          inner_join(tibble("sample-id"=sample_ids1)),
          file.path(output_directory, "phenotype1.tsv"))
write_tsv(phenotype %>%
          inner_join(tibble("sample-id"=sample_ids2)),
          file.path(output_directory, "phenotype2.tsv"))

## Write whole genotype and the pieces.
write_tsv(genotype,
          file.path(output_directory, "genotype.tsv"))
write_tsv(genotype %>%
          select(chromosome, position, reference, all_of(sample_ids1)),
          file.path(output_directory, "genotype1.tsv"))
write_tsv(genotype %>%
          select(chromosome, position, reference, all_of(sample_ids2)),
          file.path(output_directory, "genotype2.tsv"))