## Quantitative Genomics and Genetics

### HW5 Key (R codes)

â€“ 10 November 2014

### Problem 2

1. Simulate Genotypes
# Load the necessary packages

library(MASS)      # for ginv()

library(ggplot2)   # for ggplot()

set.seed(1500)

N <- 100

nucleotides <- c("A1","A2")

sim.genotypes <- matrix(NA, nrow = N, ncol = 2)

MAF <- 0.3

for( i in 1:N){

sim.genotypes[i,] <- sample(nucleotides, 2, replace = TRUE, prob = c(MAF,1-MAF))

}

minor.allele <- "A1"

temp <- 1 * (sim.genotypes == minor.allele)

Xa.mx <- as.matrix(temp[,1] + temp[,2] - 1 )

Xd.mx <- 1 - (2 * abs(Xa.mx))
1. Simulate two phenotypes
beta1 <- c(1,0,0)

beta2 <- c(1,1,-1)

X.mx <- cbind(1,Xa.mx,Xd.mx)

k.null <- X.mx%*%beta1

k.alt <- X.mx%*%beta2

expected.y1 <- exp(k.null) / (1+ exp(k.null))

expected.y2 <- exp(k.alt) / (1+ exp(k.alt))

error1 <- rbinom(n = N,size=1,prob = expected.y1) - expected.y1

error2 <- rbinom(n = N,size=1,prob = expected.y2) - expected.y2

y1 <- expected.y1+error1

y2 <- expected.y2+error2
1. Plots
plot.df <- data.frame( "Y1" = y1, "Y2"= y2, "Xa" = Xa.mx, "Xd" = Xd.mx)

jitter.plot1 <- ggplot(plot.df, aes(x = Xa, y = Y1))+ geom_point(position = position_jitter(w = 0.1, h = 0.1)) + ggtitle("Xa vs Phenotype 1")

jitter.plot2 <- ggplot(plot.df, aes(x = Xd, y = Y1))+ geom_point(position = position_jitter(w = 0.1, h = 0.1)) + ggtitle("Xd vs Phenotype 1")

jitter.plot3 <- ggplot(plot.df, aes(x = Xa, y = Y2))+ geom_point(position = position_jitter(w = 0.1, h = 0.1)) + ggtitle("Xa vs Phenotype 2")

jitter.plot4 <- ggplot(plot.df, aes(x = Xd, y = Y2))+ geom_point(position = position_jitter(w = 0.1, h = 0.1)) + ggtitle("Xd vs Phenotype 1")

print(jitter.plot1)

print(jitter.plot2)

print(jitter.plot3)

print(jitter.plot4)

1. MLE with IRLS (Alternative)
logistic.IRLS<- function(Xa,Xd,Y = Y, beta.initial.vec = c(0,0,0), d.stop.th = 1e-6, it.max = 100) {

#Create the X matrix

X.mx <- cbind(rep(1,dim(Y)[1]), Xa, Xd)

Y <- as.matrix(Y)

N <- dim(X.mx)[1]

#check this matrix:

#initialize the beta parameter vector at t=0

beta.t <- beta.initial.vec

# initialize deviance at d[t]

dt<-0

#initialize gamma

# K is the part which goes into the exponent

K<-X.mx %*% beta.t

gamma_inv<-exp(K)/(1+exp(K))

for(i in 1:it.max) {

dpt1<-dt #store previous deviance

# create empty matrix W

W<-matrix(0,N,N)

# Fill in the diagonal of W with appropriate values

for(j in 1:N){

W[j,j] <- gamma_inv[j] * (1-gamma_inv[j])

}

beta.t <- beta.t + ginv(t(X.mx)%*%W%*%X.mx)%*%t(X.mx)%*%(Y-gamma_inv)

#update gamma since it's a function of beta

K <- X.mx %*% beta.t

gamma_inv <- exp(K)/(1+exp(K))

#calculate new deviance

dt <- 2*( sum(Y[Y==1]*log(Y[Y==1]/gamma_inv[Y==1])) + sum((1-Y[Y==0])*log((1-Y[Y==0])/(1-gamma_inv[Y==0]))) )

absD <- abs(dt - dpt1)

if(absD < d.stop.th) {

cat("Convergence at iteration:", i, "at threshold:", d.stop.th, "\n")

logl<-sum(Y*log(gamma_inv)+(1-Y)*log(1-gamma_inv))

return(list(beta.t,logl))

}

}

cat("Convergence not reached after iteration:", i, "at threshold:", d.stop.th, "\n")

return(list(beta.t= c(NA,NA,NA),logl=NA))

}

y1.IRLS.result <- logistic.IRLS(Xa = Xa.mx,Xd = Xd.mx, Y=y1)
Convergence at iteration: 4 at threshold: 1e-06 
y2.IRLS.result <- logistic.IRLS(Xa = Xa.mx,Xd = Xd.mx, Y=y2)
Convergence at iteration: 5 at threshold: 1e-06 
beta1 - y1.IRLS.result[[1]]
           [,1]

[1,] -0.1350392

[2,] -0.0942003

[3,]  0.2748379
beta2 - y2.IRLS.result[[1]]
           [,1]

[1,] 0.44685524

[2,] 0.30014131

[3,] 0.04958164
1. MLE with IRLS (Null)
y1.null.result <- logistic.IRLS(Xa = NULL,Xd = NULL, Y=y1, beta.initial.vec = c(0))
Convergence at iteration: 4 at threshold: 1e-06 
y2.null.result <- logistic.IRLS(Xa = NULL,Xd = NULL, Y=y2, beta.initial.vec = c(0))
Convergence at iteration: 3 at threshold: 1e-06 
beta1[1] - y1.null.result[[1]]
           [,1]

[1,] -0.1526795
beta2[1] - y2.null.result[[1]]
          [,1]

[1,] 0.5526878
1. LRT calculations
log0.y1 <- y1.null.result[[2]]

log0.y2 <- y2.null.result[[2]]

LRT.y1 <- 2*(y1.IRLS.result[[2]]-log0.y1)

LRT.y2 <- 2*(y2.IRLS.result[[2]]-log0.y2)

LRT.y1
[1] 1.08897
LRT.y2
[1] 15.48612
1. P-value Calculations
pval.y1 <- pchisq(LRT.y1, 2, lower.tail = FALSE)

pval.y2 <- pchisq(LRT.y2, 2, lower.tail = FALSE)

pval.y1
[1] 0.5801405
pval.y2
[1] 0.0004337419