Quantitative Genomics and Genetics 2016

Homework 6 R-code Key

– 7 May 2016

– Author: Jin Hyun Ju (jj328@cornell.edu)

Problem 2

b) MLE for the NULL hypothesis

The MLE for the NULL hypothesis is the same for all genotypes, since it is assuming that the \(\beta_a\) values are 0 thus only requiring estimating the value for \(\beta_{\mu}\).

library(MASS)
library(ggplot2)

W_calc <- function(gamma_inv){
    N <- length(gamma_inv)
        W<-diag(as.vector(gamma_inv * (1- gamma_inv)))
      
    return(W)
}

beta_update <- function(X_mx, W, Y, gamma_inv, beta){
  beta_up <- beta + ginv(t(X_mx)%*%W%*%X_mx)%*%t(X_mx)%*%(Y-gamma_inv)
    return(beta_up)
}

gamma_inv_calc <- function(X_mx, beta_t){
    #initialize gamma
    # K is the part which goes into the exponent
    K <- X_mx %*% beta_t
    gamma_inv <- exp(K)/(1+exp(K))
    return(gamma_inv)
}

dev_calc <- function(Y, gamma_inv){
    deviance <- 2*( sum(Y[Y==1]*log(Y[Y==1]/gamma_inv[Y==1])) + sum((1-Y[Y==0])*log((1-Y[Y==0])/(1-gamma_inv[Y==0]))) )  
    return(deviance)
}

loglik_calc <- function(Y, gamma_inv){
    loglik <- sum(Y*log(gamma_inv)+(1-Y)*log(1-gamma_inv))
    return(loglik)
}

logistic.IRLS<- function(Xa,Xd,Y =Y, beta.initial.vec = c(0,0,0), d.stop.th = 1e-6, it.max = 100) {

  #Create the X matrix
  X_mx <- cbind(rep(1,nrow(Y)), Xa, Xd)
  
  #check this matrix:
    #initialize the beta parameter vector at t=0
    beta_t <- beta.initial.vec
  
  # initialize deviance at d[t]
    dt <- 0
    
    #initialize gamma
  # K is the part which goes into the exponent
  gamma_inv <- gamma_inv_calc(X_mx, beta_t)
    
    for(i in 1:it.max) {
        dpt1 <- dt #store previous deviance
        
    # create empty matrix W
        W <- W_calc(gamma_inv)
    
        beta_t <- beta_update(X_mx, W, Y, gamma_inv, beta_t)
        
        #update gamma since it's a function of beta
        
        gamma_inv <- gamma_inv_calc(X_mx, beta_t)

        #calculate new deviance
        dt <- dev_calc(Y, gamma_inv)
        
        absD <- abs(dt - dpt1)
        
        if(absD < d.stop.th) {
            #cat("Convergence at iteration:", i, "at threshold:", d.stop.th, "\n")
            logl <- loglik_calc(Y, gamma_inv)
            return(list(beta_t,logl))
        }   
    }
    #cat("Convergence not reached after iteration:", i, "at threshold:", d.stop.th, "\n")
    return(list(beta_t= c(NA,NA,NA),logl=NA))
}
Y <- read.table("./QG16_HW6_phenotypes.txt", header = F,stringsAsFactors = F)
geno <- read.table("./QG16_HW6_genotypes.txt", header = F)

Y <- as.matrix(Y)
colnames(Y) <- NULL


# log likelihood for NULL hypothesis
mle_null <- logistic.IRLS(Y=Y, Xa= NULL, Xd=NULL, beta.initial.vec = c(0))[[1]]

cat("MLE beta_mu for the NULL hypothesis\n")
MLE beta_mu for the NULL hypothesis
print(mle_null)
         [,1]
[1,] 0.268264

c) MLE for the NULL Alternative

beta_mle <-NULL
logl<-NULL

for(j in 1:dim(geno)[2]){
    myList<-logistic.IRLS(Xa = geno[,j], Xd = NULL,Y=Y, beta.initial.vec = c(0,0))
    beta_mle <-cbind(beta_mle,myList[[1]])
}

cat("MLE values for Alternative Hypothesis\n")
MLE values for Alternative Hypothesis
print(beta_mle)
           [,1]       [,2]      [,3]       [,4]      [,5]        [,6]
[1,]  0.2275511 0.27485364 0.2445954  0.3376320 0.3810119  0.25812157
[2,] -0.1763137 0.02852598 0.1164463 -0.1001963 0.3209759 -0.01845573
            [,7]      [,8]      [,9]      [,10]      [,11]        [,12]
[1,]  0.06831978 0.1582095 0.2896401  0.1615050  0.2069594  0.003697884
[2,] -3.11255759 3.2906927 1.6343063 -0.3117967 -0.5064454 -0.550509833
         [,13]     [,14]       [,15]
[1,] 0.2633712 0.2551295  0.23294695
[2,] 0.2696215 0.2195266 -0.07654278

e) LRT for each marker

logl_H0 <- logistic.IRLS(Y=Y, Xa= NULL, Xd=NULL, beta.initial.vec = c(0))[[2]]

for(j in 1:dim(geno)[2]){
    myList<-logistic.IRLS(Xa = geno[,j], Xd = NULL,Y=Y, beta.initial.vec = c(0,0))
    logl<-c(logl,myList[[2]])

}
LRT<-2*logl-2*logl_H0 #likelihood ratio test statistic

cat("LRT for each marker\n")
LRT for each marker
print(LRT)
 [1] 1.192766e+00 3.137203e-02 4.502196e-01 2.298408e-01 3.442340e+00
 [6] 9.230014e-03 1.433018e+02 1.493237e+02 7.427857e+01 2.956867e+00
[11] 9.418434e+00 8.585346e+00 2.545098e+00 1.680166e+00 1.710400e-01

f) p-values for each marker

cat("LRT for each marker\n")
LRT for each marker
pval <- pchisq(LRT, 2, lower.tail = F)
print(pval)
 [1] 5.508003e-01 9.844364e-01 7.984286e-01 8.914371e-01 1.788567e-01
 [6] 9.953956e-01 7.627998e-32 3.756313e-33 7.423592e-17 2.279946e-01
[11] 9.011829e-03 1.366834e-02 2.801167e-01 4.316746e-01 9.180348e-01
cat("Most significant hit #", which.min(pval),"=",pval[which.min(pval)],"\n")
Most significant hit # 8 = 3.756313e-33 

g) manhattan plot

plot(-log10(pval), main = "Manhattan plot")

h) QQ plots

expected_pvals <- sort(runif(length(pval)), decreasing = FALSE)
observed_pvals <- sort(pval, decreasing = FALSE)


plot(-log10(expected_pvals), -log10(observed_pvals), main = "QQplot for all genotypes")
abline( a= 0 , b = 1, col = "red")