Quantitative Genomics and Genetics

Midterm Key (R codes)

– 12 October 2014

Problem 1

    1. Phenotype histogram
phenotype <- read.table("./QG14_phenotypes_midterm.txt", header = F)

# If you skip the conversion of the phenotype data frame into a matrix R will complain
phenotype <- as.matrix(phenotype)

hist(phenotype, main = "Histogram of Phenotype", col = "skyblue")

plot of chunk unnamed-chunk-1

Problem 2

    1. Calculate the minor allele frequency (MAF) for each SNP and plot a histogram
byrow.geno <- read.table("./QG14_genotypes_midterm.txt", sep = ",", header = T)

# Same with the genotypes, in order to do matrix calculations you have to convert the data frame into a matrix
byrow.geno <- as.matrix(byrow.geno)

# get the number of genotypes and get the sample size (N)
n.geno <- dim(byrow.geno)[2]
N <- dim(phenotype)[1]


# function to calculate the maf for each genotype
get_maf <- function(geno.mx){
  maf <- min(table(geno.mx)) / sum(table(geno.mx))
  return(maf)
}

# declare maf.vec to hold MAF values 
maf.vec <- vector(length = n.geno)
for ( i in 1: (n.geno)){
  maf.vec[i] <- get_maf(byrow.geno[,i]) 
}

# Plot a histgram of MAF values 
hist(maf.vec, main ="MAF histogram", col = "skyblue")

plot of chunk unnamed-chunk-2

    1. Remove all SNPs that have an MAF < 0.06 and plot a new histgoram of the MAFs that remain
filtered.geno <- byrow.geno[,which(maf.vec >= 0.06)]

# Plot a new histogram for MAF > 0.06
hist(maf.vec[which(maf.vec >= 0.06)], main ="MAF histogram", col = "skyblue")

plot of chunk unnamed-chunk-3

    1. How many SNPs are left?
remaining.geno <- sum(maf.vec >= 0.06)

cat("After filtering for MAF <0.06, ",remaining.geno, "genotypes are left \n")
After filtering for MAF <0.06,  2856 genotypes are left 

Problem 3

    1. calculate \(MLE(\hat{\beta})\) for each genotype and plot three histograms
# vectors for odd and even rows to get the genotypes into the form we used to work with  ( each column = single allele)
odd <- seq(1,dim(byrow.geno)[1],by =2)
even <- seq(2,dim(byrow.geno)[1], by = 2)



n.geno <- dim(filtered.geno)[2]

# define a matrix with dimensions N x (2* n.geno) for conversion
genotypes <- matrix (NA, nrow = N, ncol = n.geno*2)

# loop over remaining genotypes
# split the 600 rows into two columns which are next to each other 
# The resulting format will be 1 column = 1 allele / column 1,2 = genotype 1 column 3,4 = genotype 2 ... 
# this step is not necessary if you are comfortable handling the genotypes as they are given to you
for( i in 1: n.geno){
  genotypes[,2*i -1] <- as.character(filtered.geno[odd,i])
  genotypes[,2*i] <- as.character(filtered.geno[even,i])
}

# Xa conversion
Xa <- matrix(NA, nrow = N, ncol = n.geno)
for ( i in 1: n.geno){
  minor.allele <- names(which.min(table(as.matrix(genotypes[,c(2*i -1,2*i)]))))
  
  temp <- 1 * (genotypes[,c(2*i -1, 2*i)] == minor.allele)
  Xa[,i] <- temp[,1] + temp[,2] - 1 
  
}

# Xd conversion
Xd <-  1 - 2*abs(Xa)


# Setting the column names to avoid confusion
colnames(Xa) <- colnames(filtered.geno)
colnames(Xd) <- colnames(filtered.geno)


MLE.beta <- matrix(NA, nrow = n.geno, ncol = 3)

for (i in 1 : n.geno){
  X.mx <- cbind(1,Xa[,i],Xd[,i])
  
  MLE.beta[i,] <- solve(t(X.mx) %*% X.mx) %*% t(X.mx) %*% as.matrix(phenotype)
  
}

par(mfrow = c(1,3))
hist(MLE.beta[,1], main= "Histogram for MLE(beta_mu)", col= "darkblue")
hist(MLE.beta[,2], main= "Histogram for MLE(beta_a)", col= "blue")
hist(MLE.beta[,3], main= "Histogram for MLE(beta_d)", col= "skyblue")

plot of chunk unnamed-chunk-5

    1. Calculate p-values for each genotype and generate a manhattan plot
pval <- vector(length = n.geno)

for (i in 1:n.geno) {
  X.mx <- cbind(1,Xa[,i],Xd[,i])
  y.hat <- X.mx %*% MLE.beta[i,]
  
  SSM <- sum((y.hat - mean(phenotype))^2)
  SSE <- sum((phenotype - y.hat)^2)
  
  df.M <- 2
  df.E <- N - 3 
  
  MSM <- SSM / df.M
  MSE <- SSE / df.E
  
  Fstatistic <- MSM / MSE 

  pval[i] <- pf(Fstatistic, df.M, df.E,lower.tail = FALSE)

}

plot(-log10(pval), main = "Example manhattan plot", col = "darkblue")

plot of chunk unnamed-chunk-6

Problem 4

    1. Plot a histogram of all P-values
hist(pval, main = "Histogram of P-values", col = "skyblue")

plot of chunk unnamed-chunk-7

Problem 5

    1. Cutoff for a = 0.05 after bonferroni correction
cutoff <- 0.05 / n.geno

cat("Bonferroni corrected cutoff = ", cutoff, "\n")
Bonferroni corrected cutoff =  1.751e-05 
    1. Identify singificant hits
# pair genotype names with their p-values
names(pval) <- colnames(Xa)

# Filter the p-values to only work with significant hits
sig.pvals <- which(pval < cutoff)

# Sort the significant p-values 
sort(pval[sig.pvals])
     G730     G1500      G731      G732      G737      G740      G168 
2.150e-11 2.150e-11 7.563e-11 7.563e-11 8.772e-09 8.772e-09 9.165e-09 
     G175      G166      G741      G743      G744      G745      G746 
1.882e-08 5.451e-08 2.721e-07 2.721e-07 2.721e-07 2.721e-07 2.721e-07 
     G165      G135      G138      G140      G141      G143      G139 
3.827e-06 6.089e-06 7.249e-06 7.249e-06 7.249e-06 7.249e-06 1.461e-05 
     G727 
1.581e-05 
cat("Total number of causal polymorphisms = 3 / significant markers = G730,G1500,G168")
Total number of causal polymorphisms = 3 / significant markers = G730,G1500,G168

Problem 6

    1. determine the p-value cutoff for FDR ~ 0.05
# Adjust the pvalues using the base functions p.adjust with the method set to FDR
# This will result in "adjusted p-values" which are multiplied by a certain number to correct for FDR
# For the bonferroni cut off we could have done
# bon.pvals <- p.adjust(pval, method = "bonferroni")
# bon.pvals[which(bon.pvals < 0.05 )]
FDR.pvals <- p.adjust(pval, method = "fdr")


# To find the actual FDR cuoff 

alpha <- seq(from = 0.00001,to = 0.05, by = 0.00001)
FDR <- vector(length = length(alpha))
for ( i in 1:length(alpha)){
  FDR[i] <- (length(pval) * alpha[i]) / sum(pval < alpha[i])
}
alpha[which.min(abs(FDR - 0.05))]
[1] 0.00086
    1. Number of hits using FDR < 0.05
# now we can simply ask which ones are below 0.05
sig.geno.pvals <- FDR.pvals[which(FDR.pvals < 0.05)]
sig.geno.idx <- names(sig.geno.pvals) 
sig.geno.num <- as.numeric(sub("G","",sig.geno.idx))
sort(sig.geno.pvals)
     G730     G1500      G731      G732      G168      G737      G740 
3.071e-08 3.071e-08 5.400e-08 5.400e-08 3.739e-06 3.739e-06 3.739e-06 
     G175      G166      G741      G743      G744      G745      G746 
6.718e-06 1.730e-05 5.551e-05 5.551e-05 5.551e-05 5.551e-05 5.551e-05 
     G165      G135      G138      G140      G141      G143      G139 
7.286e-04 1.035e-03 1.035e-03 1.035e-03 1.035e-03 1.035e-03 1.988e-03 
     G727     G2143      G122      G117     G2155     G2140      G221 
2.053e-03 2.178e-03 3.121e-03 4.148e-03 4.148e-03 4.555e-03 7.244e-03 
    G2112     G2120      G834     G2169     G2466     G2077       G85 
8.894e-03 8.894e-03 9.517e-03 2.071e-02 2.115e-02 3.041e-02 3.295e-02 
      G97      G102     G2250     G2253      G107      G807      G747 
3.295e-02 3.295e-02 3.387e-02 3.387e-02 3.420e-02 3.435e-02 3.608e-02 
     G748     G2160     G2203     G2168      G734      G271      G829 
3.608e-02 3.608e-02 3.608e-02 3.935e-02 4.472e-02 4.722e-02 4.893e-02 
#plot(sig.geno.num, -log10(sig.geno.pvals), main = "FDR significant hits", col ="darkblue")


cat("Total number of causal polymorphisms (FDR < 0.05) = 10 / significant markers = G730,G1500,G168, G2143,G221,G834,G2466,G2077,G85,G2250")
Total number of causal polymorphisms (FDR < 0.05) = 10 / significant markers = G730,G1500,G168, G2143,G221,G834,G2466,G2077,G85,G2250
cat("FDR corrected p-values","\n")
FDR corrected p-values 
sig.geno.pvals[c("G730","G1500","G168", "G2143","G221","G834","G2466","G2077","G85","G2250")]
     G730     G1500      G168     G2143      G221      G834     G2466 
3.071e-08 3.071e-08 3.739e-06 2.178e-03 7.244e-03 9.517e-03 2.115e-02 
    G2077       G85     G2250 
3.041e-02 3.295e-02 3.387e-02 
cat("Raw p-values","\n")
Raw p-values 
pval[c("G730","G1500","G168", "G2143","G221","G834","G2466","G2077","G85","G2250")]
     G730     G1500      G168     G2143      G221      G834     G2466 
2.150e-11 2.150e-11 9.165e-09 1.754e-05 7.102e-05 1.033e-04 2.444e-04 
    G2077       G85     G2250 
3.620e-04 4.253e-04 4.625e-04 

Problem 7

    1. Calculate correlation between the Xa values of the two most significant markers
sd1 <- sd(Xa[,"G730"])
sd2 <- sd(Xa[,"G1500"])

covariance.Xa <- (sum((Xa[,"G730"] - mean(Xa[,"G730"])) * (Xa[,"G1500"] - mean(Xa[,"G1500"])))/(N-1))
correlation.Xa <- covariance.Xa / (sd1 * sd2)

cat("Correlation between markers G730 and G 1500 = ", correlation.Xa,"\n")
Correlation between markers G730 and G 1500 =  1 

Problem 8

    1. X-Y plots for markers 168 and 1010
par(mfrow = c(2,2))
plot(Xa[,"G168"],phenotype,main = "Marker G168 Xa vs Y", col = "blue" )
plot(Xd[,"G168"],phenotype,main = "Marker G168 Xd vs Y", col = "blue" )

plot(Xa[,"G1010"],phenotype,main = "Marker G1010 Xa vs Y", col = "blue" )
plot(Xd[,"G1010"],phenotype,main = "Marker G1010 Xd vs Y", col = "blue" )