# Quantitative Genomics and Genetics 2016

Midterm R-code Key

â€“ 4 April 2016

â€“ Author: Jin Hyun Ju (jj328@cornell.edu)

### Problem 3

#### a)

``````pop_mean <- 1.5
pop_var <- 1.3

M <- 10000
N <- 20

sampling_function<- function(number_of_samples = 1000,
sample_size = 100,
input_mean = 0,
input_var = 1){

sample_matrix <- matrix(NA, nrow = number_of_samples, ncol = sample_size)

for ( i in 1:number_of_samples ){
sample_matrix[i,] <- rnorm(sample_size,
mean = input_mean,
sd = sqrt(input_var))
}
return(sample_matrix)

}

likelihood_normal <- function(input_mean, input_sd, x){
temp = 1
for ( i in x){
temp = temp * (1/ sqrt( 2 * pi * input_sd^2)) * exp( - ((i - input_mean) ^ 2) / (2 * input_sd^2))
}
return(temp)
}

sample_a <- sampling_function(number_of_samples = M, sample_size = N, input_mean = 1, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_a)){
x <- sample_a[i,]
sd_mle <- sqrt(mean((x-mean(x))^2))
like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
like_ratio <- -2 * log(like_null / like_alt)
lrt_vector <- c(lrt_vector, like_ratio)

}

hist(lrt_vector, main = "Problem 3 (a)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")``````
``-2ln values greater than 3.841459 =  489 ``
``````pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)``````

#### b)

``````N <- 100
sample_b <- sampling_function(number_of_samples = M, sample_size = N, input_mean = 1, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_b)){
x <- sample_b[i,]
sd_mle <- sqrt(mean((x-mean(x))^2))
like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
like_ratio <- -2 * log(like_null / like_alt)
lrt_vector <- c(lrt_vector, like_ratio)

}

hist(lrt_vector, main = "Problem 3 (b)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")``````
``-2ln values greater than 3.841459 =  471 ``
``````pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)``````

#### c)

``````N <- 20

sample_c <- sampling_function(number_of_samples = M, sample_size = N, input_mean = pop_mean, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_c)){
x <- sample_c[i,]
sd_mle <- sqrt(mean((x-mean(x))^2))
like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
like_ratio <- -2 * log(like_null / like_alt)
lrt_vector <- c(lrt_vector, like_ratio)

}

hist(lrt_vector, main = "Problem 3 (c)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")``````
``-2ln values greater than 3.841459 =  4990 ``
``````pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)``````

#### d)

``````N <- 100

sample_d <- sampling_function(number_of_samples = M, sample_size = N, input_mean = pop_mean, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_d)){
x <- sample_d[i,]
sd_mle <- sqrt(mean((x-mean(x))^2))
like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
like_ratio <- -2 * log(like_null / like_alt)
lrt_vector <- c(lrt_vector, like_ratio)

}

hist(lrt_vector, main = "Problem 3 (d)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")``````
``-2ln values greater than 3.841459 =  9915 ``
``````pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)``````

### Problem 6

``````phenotypes <- read.table("QG16_phenotypes_midterm.txt", stringsAsFactors = FALSE, row.names = 1)
midterm_genotypes <- read.table("QG16_genotypes_midterm.txt", stringsAsFactors = FALSE)

phenotypes <- as.matrix(phenotypes)

hist(phenotypes, main = "Histogram of Midterm Phenotypes")

outlier_pheno <- which.max(phenotypes)
cat("Outlier phenotypes = ", rownames(phenotypes)[outlier_pheno], "\n")``````
``Outlier phenotypes =  sample_222 ``
``````filtered_phenotypes <- phenotypes[-outlier_pheno, ]

filtered_genotypes <- midterm_genotypes[-outlier_pheno,]

hist(filtered_phenotypes, main = "Phenotypes after outlier removal")``````

### Problem 7

``cat("Identifying individuals with missing genotype data\n")``
``Identifying individuals with missing genotype data``
``which(filtered_genotypes == 9, arr.ind = TRUE)``
``````           row  col
sample_67   67 1167
sample_110 110 1820
sample_180 180 3540
sample_230 229 5100``````
``````missing_geno_names <- colnames(filtered_genotypes)[(which(filtered_genotypes == 9, arr.ind = TRUE)[,2])]
missing_geno <- sapply(strsplit(missing_geno_names, "_"), function(x) x[1])

cat("Genotypes with missing data: \n")``````
``Genotypes with missing data: ``
``print(missing_geno)``
``[1] "geno584"  "geno910"  "geno1770" "geno2550"``
``````missing_idx <- c()

for( m in missing_geno){
missing_idx <- c(missing_idx, grep(m, colnames(filtered_genotypes)))
}

no_missing_geno <- as.matrix(filtered_genotypes[,-missing_idx])``````

### Problem 8

#### a)

``````library(MASS)

xa_converter <- function(geno_in){
geno_count <- table(geno_in)
minor_allele <- names(geno_count[geno_count == min(geno_count)])
xa_code <- ifelse(geno_in == minor_allele, 1,0)
xa_result <- rowSums(xa_code) - 1
return(xa_result)
}

MLE_calculator <- function(pheno_input, xa_input, xd_input){
n_samples <- length(xa_input)

X_mx <- cbind(1,xa_input,xd_input)

MLE_beta <- ginv(t(X_mx) %*% X_mx) %*% t(X_mx) %*% pheno_input

return(MLE_beta)
}

xa_matrix <- matrix(NA,
nrow = nrow(no_missing_geno),
ncol = ncol(no_missing_geno)/2)

colnames(xa_matrix) <- unique(sapply(strsplit(colnames(no_missing_geno), "_"), function(x) x[1]))

for (i in 1:(ncol(no_missing_geno)/2)){
xa_matrix[,i] <- xa_converter(no_missing_geno[,c(2*i -1, 2*i)])
}

xd_matrix <- 1 - (2 * abs(xa_matrix))

mle_matrix <- matrix(NA, nrow = ncol(xa_matrix), ncol = 3)

for(k in 1:ncol(xa_matrix)){

mle_matrix[k,] <- MLE_calculator(filtered_phenotypes, xa_matrix[,k], xd_matrix[,k])

}

par(mfrow = c(1,3))
hist(mle_matrix[,1], main = "MLE beta_mu")
hist(mle_matrix[,2], main = "MLE beta_a")
hist(mle_matrix[,3], main = "MLE beta_d")``````