Quantitative Genomics and Genetics 2016

Homework 5 R-code Key

– 11 March 2016

– Author: Jin Hyun Ju (jj328@cornell.edu)

Problem 2

a) Simulating genotype data

genotype_simulator <- function(n_sample, 
                               n_geno){
  
    geno_mx <- matrix(NA, nrow = n_sample, ncol = n_geno)
    
    for (i in 1:n_geno){
      
        geno_mx[,i] <- sample(c("AA", "AB", "BB"), n_sample, prob = c(1/4, 1/2, 1/4), replace = TRUE)
        
    }
    
    return(geno_mx)
    
}

geno_mx <- genotype_simulator(200, 250)

b) Converting simulated genotypes to Xa and Xd

conversion_vec <- c("AA" = -1, "AB" = 0, "BB" = 1)

xa_matrix <- apply(geno_mx, 2, function(x) conversion_vec[x])

xd_matrix <- 1 - 2*(abs(xa_matrix))

c) Simulating phenotype

beta_mu <- 1

beta_a_25 <- -0.2
beta_d_25 <- 0
beta_a_100 <- 0
beta_d_100 <- -0.2
beta_a_175 <- 0.2
beta_d_175 <- 0.2
beta_a_225 <- 0
beta_d_225 <- 0

error_sd <- 1

pheno <- beta_mu + xa_matrix[,25] * beta_a_25 + xd_matrix[,25] * beta_d_25 + xa_matrix[,100] * beta_a_100 + xd_matrix[,100] * beta_d_100 + xa_matrix[,175] * beta_a_175 + xd_matrix[,175] * beta_d_175 + xa_matrix[,225] * beta_a_225 + xd_matrix[,225] * beta_d_225 + rnorm(nrow(xa_matrix), 0, error_sd)

d) Plotting a histogram of the simulated phenotype

hist(pheno, main = "Simulated phenotype", ylab = "count", xlab = "pheotype")

e) X-Y plots for genotypes and phenotypes

par(mfrow = c(4, 2))
geno_idx <- c(25, 100, 175, 225)

for (i in geno_idx){
  
    plot(xa_matrix[,i], pheno, xlab = paste0("Genotype (Xa) ",i), ylab = "phenotype")
    plot(xd_matrix[,i], pheno, xlab = paste0("Genotype (Xd) ",i), ylab = "phenotype")
}

f) MLE(beta) calculations

library(MASS)

MLE_calculator <- function(pheno_input, xa_input, xd_input){
    n_samples <- length(xa_input)

    X_mx <- cbind(1,xa_input,xd_input)
    
    MLE_beta <- ginv(t(X_mx) %*% X_mx) %*% t(X_mx) %*% pheno_input

    return(MLE_beta)
}


mle_matrix <- matrix(NA, nrow = ncol(xa_matrix), ncol = 3)

colnames(mle_matrix) <- c("beta_mu", "beta_a", "beta_d")

for (i in 1:ncol(xa_matrix)){
      mle_matrix[i,] <- MLE_calculator(pheno, xa_matrix[,i], xd_matrix[,i])
  
}

par(mfrow = c(1,3))

for( j in 1:ncol(mle_matrix)){
  
    hist(mle_matrix[,j], main = colnames(mle_matrix)[j], xlab = "") 
  
}

f) P-value calculations

fstat_calculator <- function(pheno_input, xa_input, xd_input){
    n_samples <- length(xa_input)

    X_mx <- cbind(1,xa_input,xd_input)
    
    MLE_beta <- ginv(t(X_mx) %*% X_mx) %*% t(X_mx) %*% pheno_input
    y_hat <- X_mx %*% MLE_beta
  
    SSM <- sum((y_hat - mean(pheno_input))^2)
    SSE <- sum((pheno_input - y_hat)^2)
  
    df_M <- 2
    df_E <- n_samples - 3 
  
    MSM <- SSM / df_M
    MSE <- SSE / df_E
  
    Fstatistic <- MSM / MSE
  
    return(Fstatistic)
}


f_stat <- c()

for( i in 1:ncol(xa_matrix)){
  
    f_stat[i] <- fstat_calculator(pheno, xa_matrix[,i], xd_matrix[,i])  
}

h) P-value calculations

pvals <- pf(f_stat, 2, 197, lower.tail = FALSE)

plot(-log10(pvals), main = "Manhattan Plot")

abline(v = 25, col = 'red')
abline(v = 100, col = 'red')
abline(v = 175, col = 'red')
abline(v = 225, col = 'red')

i) Identifying significant hits

which(pvals < 0.05)
 [1]   3  36  40  51  54  63  66 100 105 112 134 143 175 200 210 238

j) Bonferroni corrected p-values

which(pvals < (0.05 / 250))
integer(0)