## Quantitative Genomics and Genetics

### Final Key (R codes)

â€“ 9 December 2014

### Problem 1

1. Calculate the Minor Allele Frequencies of the genotypes, filter out genotypes that have MAF < 0.05. Plot histogram
``````genotype.df <- read.table("~/Dropbox/Quantitative_Genomics_2014/Final/QG14_genotypes_final.ped", stringsAsFactors = FALSE, colClasses = c("character"))

genotype.info <- read.table("./QG14_genotypes_final.map", stringsAsFactors = FALSE, colClasses = c("character"))
rownames(genotype.info) <- genotype.info\$V2 # set the rownames to genotype ids
colnames(genotype.info) <- c("Chr","ID","0","Position")

genotype.only <- genotype.df[,-c(1:6)]  # remove columns without genotypes

rownames(genotype.only) <- genotype.df\$V1 # set rownames to sample IDs

n.samples <- dim(genotype.only)[1]
n.geno <- dim(genotype.only)[2] /2 # get number of genotypes
MAF.vec <- vector(length = n.geno) # define a vector to hold MAF values

for ( i in 1: n.geno){

MAF.vec[i] <- min(table(as.matrix(genotype.only[,c((2*i -1),2*i)]))) / sum(table(as.matrix(genotype.only[,c((2*i -1),2*i)])))

}

names(MAF.vec) <- genotype.info\$ID # label the genotypes to avoid confusion
maf.low <- which(MAF.vec < 0.05)   # get indexes for genotypes to filter

MAF.filtered <- MAF.vec[-maf.low]  # remove low MAF genotypes

hist(MAF.filtered , main = "Histogram of MAF after filtering out < 0.05", col = "skyblue")``````

``````# additional processing to create Xa and Xd matrix with filtered genotypes

n.geno.filter <- length(MAF.filtered)

Xa <- matrix(NA, nrow = n.samples, ncol = n.geno)
colnames(Xa) <- names(MAF.vec)
# Xa conversion

# covert all genotypes first and then filter (For convenience)
for ( i in 1: n.geno){

minor.allele <- names(which.min(table(as.matrix(genotype.only[,c(2*i -1,2*i)]))))

temp <- 1 * (genotype.only[,c(2*i -1, 2*i)] == minor.allele)
Xa[,i] <- temp[,1] + temp[,2] - 1

}

Xa <- Xa[,-maf.low]   # filter out genotypes with low MAF

Xd <- 1 - 2 * abs(Xa)``````
1. How many SNPs remain after filtering?
``cat("Original number of genotypes = ", n.geno,"\n")``
``## Original number of genotypes =  1745``
``cat("Number of genotypes with MAF < 0.05 = ", length(maf.low), "\n")``
``## Number of genotypes with MAF < 0.05 =  249``
``cat("Number of genotypes left after filtering = ", n.geno.filter,"\n")``
``## Number of genotypes left after filtering =  1496``

### Problem 2

1. Calculate the minor allele frequency (MAF) for each SNP and plot a histogram
``````phenotype1 <- read.table("./QG14_phenotypes1_final.txt", stringsAsFactors = FALSE)

hist(as.matrix(phenotype1), main = "Histogram of Phenotype 1", col = "skyblue")``````

1. Calculate p-values for filtered genotypes
``````phenotype1 <- as.matrix(phenotype1)
pval1 <- vector(length = n.geno.filter)

for (i in 1 : n.geno.filter){

N.samples <- dim(Xa)[1]
X.mx <- cbind(1,Xa[,i],Xd[,i])

MLE.beta <- solve(t(X.mx) %*% X.mx) %*% t(X.mx) %*% phenotype1
y.hat <- X.mx %*% MLE.beta

SSM <- sum((y.hat - mean(phenotype1))^2)
SSE <- sum((phenotype1 - y.hat)^2)

df.M <- 2
df.E <- N.samples - 3

MSM <- SSM / df.M
MSE <- SSE / df.E

Fstatistic <- MSM / MSE

pval1[i] <- pf(Fstatistic, df.M, df.E,lower.tail = FALSE)

}``````
1. QQ-plot for the p-values
``````theoretical.pvals <- runif(n= n.geno.filter,0,1)

theoretical.pvals <- sort(-log10(theoretical.pvals))
observed.pvals <- sort(-log10(pval1))

plot(theoretical.pvals,observed.pvals, main = "P value QQplot")
abline(a = 0,b = 1, col = "red")``````

### Problem 3

1. Bonferroni correction
``````threshold <- 0.05 / n.geno.filter
cat("Bonferroni cutoff for < 0.05 = ", threshold, "\n")``````
``Bonferroni cutoff for < 0.05 =  3.342246e-05 ``
1. How many separate peaks did you observe?
``````plot(-log10(pval1), main = "Example manhattan plot", col = "darkblue")
abline(h = -log10(threshold), col = "red")``````

1. Number of genotype
``````names(pval1) <- colnames(Xa)
sig.pvals1 <- pval1[which(pval1 < threshold)]

data.frame(genotype.info[names(sig.pvals1),],"pvals" =sig.pvals1)``````
``````          Chr        ID X0 Position        pvals
rs1867353  22 rs1867353  0 16739243 1.514161e-06
rs2165971  22 rs2165971  0 16744579 1.553209e-07
rs5992895  22 rs5992895  0 16761686 1.433948e-05
rs2283825  22 rs2283825  0 25254535 2.607829e-05
rs7291918  22 rs7291918  0 30338039 2.885365e-05
rs131254   22  rs131254  0 30461022 1.144637e-06
rs5998114  22 rs5998114  0 30481214 7.405195e-08``````

### Problem 4

1. Plot a histogram Phenotype2
``````phenotype2 <- read.table("./QG14_phenotypes2_final.txt")

hist(as.matrix(phenotype2), main = "Histogram of Phenotype2", col = "skyblue")``````

• (b)
``````library(MASS)

logistic.IRLS<- function(Xa,Xd,Y =Y, beta.initial.vec = c(0,0,0), d.stop.th = 1e-6, it.max = 100) {

#Create the X matrix
X.mx <- cbind(rep(1,dim(Y)[1]), Xa, Xd)

N <- dim(X.mx)[1]
#check this matrix:
#initialize the beta parameter vector at t=0
beta.t <- beta.initial.vec

# initialize deviance at d[t]
dt<-0

#initialize gamma
# K is the part which goes into the exponent
K<-X.mx %*% beta.t
gamma_inv<-exp(K)/(1+exp(K))

for(i in 1:it.max) {
dpt1<-dt #store previous deviance

# create empty matrix W
W<-matrix(0,N,N)

# Fill in the diagonal of W with appropriate values
for(j in 1:N){
W[j,j] <- gamma_inv[j] * (1-gamma_inv[j])
}

beta.t <- beta.t + ginv(t(X.mx)%*%W%*%X.mx)%*%t(X.mx)%*%(Y-gamma_inv)

#update gamma since it's a function of beta
K <- X.mx %*% beta.t
gamma_inv <- exp(K)/(1+exp(K))

#calculate new deviance
dt <- 2*( sum(Y[Y==1]*log(Y[Y==1]/gamma_inv[Y==1])) + sum((1-Y[Y==0])*log((1-Y[Y==0])/(1-gamma_inv[Y==0]))) )

absD <- abs(dt - dpt1)

if(absD < d.stop.th) {
#     cat("Convergence at iteration:", i, "at threshold:", d.stop.th, "\n")
logl<-sum(Y*log(gamma_inv)+(1-Y)*log(1-gamma_inv))
return(list(beta.t,logl))
}
}
# cat("Convergence not reached after iteration:", i, "at threshold:", d.stop.th, "\n")
return(list(beta.t= c(NA,NA,NA),logl=NA))
}

Y <- as.matrix(phenotype2)

beta<-NULL
logl_HA<-NULL
for(j in 1:dim(Xa)[2]){
myList<-logistic.IRLS(Xa[,j],Xd[,j],Y=Y)
beta<-cbind(beta,myList[[1]])
logl_HA<-c(logl_HA,myList[[2]])
# cat("Locus ",j,"'s beta values: ",myList[[1]],"\n")
}

# Log likelihood for null hypothesis
logl_H0 <- logistic.IRLS(Y=Y, Xa= NULL, Xd=NULL, beta.initial.vec = c(0))[[2]]

LRT<-2*logl_HA-2*logl_H0 #likelihood ratio test statistic

pval2 <- pchisq(LRT, 2, lower.tail = F)``````
1. QQ-plot for the p-values
``````theoretical.pvals <- runif(n= n.geno.filter,0,1)

theoretical.pvals <- sort(-log10(theoretical.pvals))
observed.pvals <- sort(-log10(pval2))

plot(theoretical.pvals,observed.pvals, main = "P value QQplot")
abline(a = 0,b = 1, col = "red")``````