# Quantitative Genomics and Genetics 2016

## Homework 6 R-code Key

â€“ 7 May 2016

â€“ Author: Jin Hyun Ju (jj328@cornell.edu)

### Problem 2

#### b) MLE for the NULL hypothesis

The MLE for the NULL hypothesis is the same for all genotypes, since it is assuming that the $$\beta_a$$ values are 0 thus only requiring estimating the value for $$\beta_{\mu}$$.

library(MASS)
library(ggplot2)

W_calc <- function(gamma_inv){
N <- length(gamma_inv)
W<-diag(as.vector(gamma_inv * (1- gamma_inv)))

return(W)
}

beta_update <- function(X_mx, W, Y, gamma_inv, beta){
beta_up <- beta + ginv(t(X_mx)%*%W%*%X_mx)%*%t(X_mx)%*%(Y-gamma_inv)
return(beta_up)
}

gamma_inv_calc <- function(X_mx, beta_t){
#initialize gamma
# K is the part which goes into the exponent
K <- X_mx %*% beta_t
gamma_inv <- exp(K)/(1+exp(K))
return(gamma_inv)
}

dev_calc <- function(Y, gamma_inv){
deviance <- 2*( sum(Y[Y==1]*log(Y[Y==1]/gamma_inv[Y==1])) + sum((1-Y[Y==0])*log((1-Y[Y==0])/(1-gamma_inv[Y==0]))) )
return(deviance)
}

loglik_calc <- function(Y, gamma_inv){
loglik <- sum(Y*log(gamma_inv)+(1-Y)*log(1-gamma_inv))
return(loglik)
}

logistic.IRLS<- function(Xa,Xd,Y =Y, beta.initial.vec = c(0,0,0), d.stop.th = 1e-6, it.max = 100) {

#Create the X matrix
X_mx <- cbind(rep(1,nrow(Y)), Xa, Xd)

#check this matrix:
#initialize the beta parameter vector at t=0
beta_t <- beta.initial.vec

# initialize deviance at d[t]
dt <- 0

#initialize gamma
# K is the part which goes into the exponent
gamma_inv <- gamma_inv_calc(X_mx, beta_t)

for(i in 1:it.max) {
dpt1 <- dt #store previous deviance

# create empty matrix W
W <- W_calc(gamma_inv)

beta_t <- beta_update(X_mx, W, Y, gamma_inv, beta_t)

#update gamma since it's a function of beta

gamma_inv <- gamma_inv_calc(X_mx, beta_t)

#calculate new deviance
dt <- dev_calc(Y, gamma_inv)

absD <- abs(dt - dpt1)

if(absD < d.stop.th) {
#cat("Convergence at iteration:", i, "at threshold:", d.stop.th, "\n")
logl <- loglik_calc(Y, gamma_inv)
return(list(beta_t,logl))
}
}
#cat("Convergence not reached after iteration:", i, "at threshold:", d.stop.th, "\n")
return(list(beta_t= c(NA,NA,NA),logl=NA))
}
Y <- read.table("./QG16_HW6_phenotypes.txt", header = F,stringsAsFactors = F)

Y <- as.matrix(Y)
colnames(Y) <- NULL

# log likelihood for NULL hypothesis
mle_null <- logistic.IRLS(Y=Y, Xa= NULL, Xd=NULL, beta.initial.vec = c(0))[[1]]

cat("MLE beta_mu for the NULL hypothesis\n")
MLE beta_mu for the NULL hypothesis
print(mle_null)
         [,1]
[1,] 0.268264

#### c) MLE for the NULL Alternative

beta_mle <-NULL
logl<-NULL

for(j in 1:dim(geno)[2]){
myList<-logistic.IRLS(Xa = geno[,j], Xd = NULL,Y=Y, beta.initial.vec = c(0,0))
beta_mle <-cbind(beta_mle,myList[[1]])
}

cat("MLE values for Alternative Hypothesis\n")
MLE values for Alternative Hypothesis
print(beta_mle)
           [,1]       [,2]      [,3]       [,4]      [,5]        [,6]
[1,]  0.2275511 0.27485364 0.2445954  0.3376320 0.3810119  0.25812157
[2,] -0.1763137 0.02852598 0.1164463 -0.1001963 0.3209759 -0.01845573
[,7]      [,8]      [,9]      [,10]      [,11]        [,12]
[1,]  0.06831978 0.1582095 0.2896401  0.1615050  0.2069594  0.003697884
[2,] -3.11255759 3.2906927 1.6343063 -0.3117967 -0.5064454 -0.550509833
[,13]     [,14]       [,15]
[1,] 0.2633712 0.2551295  0.23294695
[2,] 0.2696215 0.2195266 -0.07654278

#### e) LRT for each marker

logl_H0 <- logistic.IRLS(Y=Y, Xa= NULL, Xd=NULL, beta.initial.vec = c(0))[[2]]

for(j in 1:dim(geno)[2]){
myList<-logistic.IRLS(Xa = geno[,j], Xd = NULL,Y=Y, beta.initial.vec = c(0,0))
logl<-c(logl,myList[[2]])

}
LRT<-2*logl-2*logl_H0 #likelihood ratio test statistic

cat("LRT for each marker\n")
LRT for each marker
print(LRT)
 [1] 1.192766e+00 3.137203e-02 4.502196e-01 2.298408e-01 3.442340e+00
[6] 9.230014e-03 1.433018e+02 1.493237e+02 7.427857e+01 2.956867e+00
[11] 9.418434e+00 8.585346e+00 2.545098e+00 1.680166e+00 1.710400e-01

#### f) p-values for each marker

cat("LRT for each marker\n")
LRT for each marker
pval <- pchisq(LRT, 2, lower.tail = F)
print(pval)
 [1] 5.508003e-01 9.844364e-01 7.984286e-01 8.914371e-01 1.788567e-01
[6] 9.953956e-01 7.627998e-32 3.756313e-33 7.423592e-17 2.279946e-01
[11] 9.011829e-03 1.366834e-02 2.801167e-01 4.316746e-01 9.180348e-01
cat("Most significant hit #", which.min(pval),"=",pval[which.min(pval)],"\n")
Most significant hit # 8 = 3.756313e-33 

#### g) manhattan plot

plot(-log10(pval), main = "Manhattan plot")

#### h) QQ plots

expected_pvals <- sort(runif(length(pval)), decreasing = FALSE)
observed_pvals <- sort(pval, decreasing = FALSE)

plot(-log10(expected_pvals), -log10(observed_pvals), main = "QQplot for all genotypes")
abline( a= 0 , b = 1, col = "red")