Quantitative Genomics and Genetics

Final Key (R codes)

– 9 December 2014

Problem 1

    1. Calculate the Minor Allele Frequencies of the genotypes, filter out genotypes that have MAF < 0.05. Plot histogram
genotype.df <- read.table("~/Dropbox/Quantitative_Genomics_2014/Final/QG14_genotypes_final.ped", stringsAsFactors = FALSE, colClasses = c("character"))

genotype.info <- read.table("./QG14_genotypes_final.map", stringsAsFactors = FALSE, colClasses = c("character"))
rownames(genotype.info) <- genotype.info$V2 # set the rownames to genotype ids
colnames(genotype.info) <- c("Chr","ID","0","Position")

genotype.only <- genotype.df[,-c(1:6)]  # remove columns without genotypes

rownames(genotype.only) <- genotype.df$V1 # set rownames to sample IDs

n.samples <- dim(genotype.only)[1]
n.geno <- dim(genotype.only)[2] /2 # get number of genotypes
MAF.vec <- vector(length = n.geno) # define a vector to hold MAF values

for ( i in 1: n.geno){
  
    MAF.vec[i] <- min(table(as.matrix(genotype.only[,c((2*i -1),2*i)]))) / sum(table(as.matrix(genotype.only[,c((2*i -1),2*i)])))
    
}

names(MAF.vec) <- genotype.info$ID # label the genotypes to avoid confusion
maf.low <- which(MAF.vec < 0.05)   # get indexes for genotypes to filter

MAF.filtered <- MAF.vec[-maf.low]  # remove low MAF genotypes

hist(MAF.filtered , main = "Histogram of MAF after filtering out < 0.05", col = "skyblue")

# additional processing to create Xa and Xd matrix with filtered genotypes

n.geno.filter <- length(MAF.filtered)

Xa <- matrix(NA, nrow = n.samples, ncol = n.geno)
colnames(Xa) <- names(MAF.vec)
# Xa conversion

# covert all genotypes first and then filter (For convenience)
for ( i in 1: n.geno){

  minor.allele <- names(which.min(table(as.matrix(genotype.only[,c(2*i -1,2*i)]))))
  
  temp <- 1 * (genotype.only[,c(2*i -1, 2*i)] == minor.allele)
  Xa[,i] <- temp[,1] + temp[,2] - 1 
  
}

Xa <- Xa[,-maf.low]   # filter out genotypes with low MAF

Xd <- 1 - 2 * abs(Xa)
    1. How many SNPs remain after filtering?
cat("Original number of genotypes = ", n.geno,"\n")
## Original number of genotypes =  1745
cat("Number of genotypes with MAF < 0.05 = ", length(maf.low), "\n")
## Number of genotypes with MAF < 0.05 =  249
cat("Number of genotypes left after filtering = ", n.geno.filter,"\n")
## Number of genotypes left after filtering =  1496

Problem 2

    1. Calculate the minor allele frequency (MAF) for each SNP and plot a histogram
phenotype1 <- read.table("./QG14_phenotypes1_final.txt", stringsAsFactors = FALSE)

hist(as.matrix(phenotype1), main = "Histogram of Phenotype 1", col = "skyblue")

    1. Calculate p-values for filtered genotypes
phenotype1 <- as.matrix(phenotype1)
pval1 <- vector(length = n.geno.filter)

for (i in 1 : n.geno.filter){
  
  N.samples <- dim(Xa)[1]
  X.mx <- cbind(1,Xa[,i],Xd[,i])
  
  MLE.beta <- solve(t(X.mx) %*% X.mx) %*% t(X.mx) %*% phenotype1
  y.hat <- X.mx %*% MLE.beta
  
  SSM <- sum((y.hat - mean(phenotype1))^2)
  SSE <- sum((phenotype1 - y.hat)^2)
  
  df.M <- 2
  df.E <- N.samples - 3 
  
  MSM <- SSM / df.M
  MSE <- SSE / df.E
  
  Fstatistic <- MSM / MSE 
  
  pval1[i] <- pf(Fstatistic, df.M, df.E,lower.tail = FALSE)
  
}
    1. QQ-plot for the p-values
theoretical.pvals <- runif(n= n.geno.filter,0,1)

theoretical.pvals <- sort(-log10(theoretical.pvals))
observed.pvals <- sort(-log10(pval1))


plot(theoretical.pvals,observed.pvals, main = "P value QQplot")
abline(a = 0,b = 1, col = "red")

Problem 3

    1. Bonferroni correction
threshold <- 0.05 / n.geno.filter
cat("Bonferroni cutoff for < 0.05 = ", threshold, "\n")
Bonferroni cutoff for < 0.05 =  3.342246e-05 
    1. How many separate peaks did you observe?
plot(-log10(pval1), main = "Example manhattan plot", col = "darkblue")
abline(h = -log10(threshold), col = "red")

    1. Number of genotype
names(pval1) <- colnames(Xa)
sig.pvals1 <- pval1[which(pval1 < threshold)]

data.frame(genotype.info[names(sig.pvals1),],"pvals" =sig.pvals1)
          Chr        ID X0 Position        pvals
rs1867353  22 rs1867353  0 16739243 1.514161e-06
rs2165971  22 rs2165971  0 16744579 1.553209e-07
rs5992895  22 rs5992895  0 16761686 1.433948e-05
rs2283825  22 rs2283825  0 25254535 2.607829e-05
rs7291918  22 rs7291918  0 30338039 2.885365e-05
rs131254   22  rs131254  0 30461022 1.144637e-06
rs5998114  22 rs5998114  0 30481214 7.405195e-08

Problem 4

    1. Plot a histogram Phenotype2
phenotype2 <- read.table("./QG14_phenotypes2_final.txt")

hist(as.matrix(phenotype2), main = "Histogram of Phenotype2", col = "skyblue")

  • (b)
library(MASS)

logistic.IRLS<- function(Xa,Xd,Y =Y, beta.initial.vec = c(0,0,0), d.stop.th = 1e-6, it.max = 100) {
  
  #Create the X matrix
  X.mx <- cbind(rep(1,dim(Y)[1]), Xa, Xd)
  
  N <- dim(X.mx)[1]
  #check this matrix:
  #initialize the beta parameter vector at t=0
  beta.t <- beta.initial.vec
  
  # initialize deviance at d[t]
  dt<-0
  
  #initialize gamma
  # K is the part which goes into the exponent
  K<-X.mx %*% beta.t
  gamma_inv<-exp(K)/(1+exp(K))
  
  for(i in 1:it.max) {
    dpt1<-dt #store previous deviance
    
    # create empty matrix W
    W<-matrix(0,N,N)
    
    # Fill in the diagonal of W with appropriate values
    for(j in 1:N){
      W[j,j] <- gamma_inv[j] * (1-gamma_inv[j])
    }
    
    
    beta.t <- beta.t + ginv(t(X.mx)%*%W%*%X.mx)%*%t(X.mx)%*%(Y-gamma_inv)
    
    #update gamma since it's a function of beta
    K <- X.mx %*% beta.t
    gamma_inv <- exp(K)/(1+exp(K))
    
    #calculate new deviance
    dt <- 2*( sum(Y[Y==1]*log(Y[Y==1]/gamma_inv[Y==1])) + sum((1-Y[Y==0])*log((1-Y[Y==0])/(1-gamma_inv[Y==0]))) )
    
    absD <- abs(dt - dpt1)
    
    if(absD < d.stop.th) {
      #     cat("Convergence at iteration:", i, "at threshold:", d.stop.th, "\n")
      logl<-sum(Y*log(gamma_inv)+(1-Y)*log(1-gamma_inv))
      return(list(beta.t,logl))
    }   
  }
  # cat("Convergence not reached after iteration:", i, "at threshold:", d.stop.th, "\n")
  return(list(beta.t= c(NA,NA,NA),logl=NA))
}



Y <- as.matrix(phenotype2)

beta<-NULL
logl_HA<-NULL
for(j in 1:dim(Xa)[2]){
  myList<-logistic.IRLS(Xa[,j],Xd[,j],Y=Y)
  beta<-cbind(beta,myList[[1]])
  logl_HA<-c(logl_HA,myList[[2]])
  # cat("Locus ",j,"'s beta values: ",myList[[1]],"\n")
}


# Log likelihood for null hypothesis
logl_H0 <- logistic.IRLS(Y=Y, Xa= NULL, Xd=NULL, beta.initial.vec = c(0))[[2]]

LRT<-2*logl_HA-2*logl_H0 #likelihood ratio test statistic

pval2 <- pchisq(LRT, 2, lower.tail = F)
    1. QQ-plot for the p-values
theoretical.pvals <- runif(n= n.geno.filter,0,1)

theoretical.pvals <- sort(-log10(theoretical.pvals))
observed.pvals <- sort(-log10(pval2))


plot(theoretical.pvals,observed.pvals, main = "P value QQplot")
abline(a = 0,b = 1, col = "red")