Quantitative Genomics and Genetics 2016

Midterm R-code Key


– 4 April 2016

– Author: Jin Hyun Ju (jj328@cornell.edu)

Problem 3

a)

pop_mean <- 1.5
pop_var <- 1.3

M <- 10000
N <- 20 


sampling_function<- function(number_of_samples = 1000,
                             sample_size = 100,
                             input_mean = 0,
                             input_var = 1){
  
    sample_matrix <- matrix(NA, nrow = number_of_samples, ncol = sample_size)
    
    for ( i in 1:number_of_samples ){
        sample_matrix[i,] <- rnorm(sample_size,
                                   mean = input_mean, 
                                   sd = sqrt(input_var))
    }
    return(sample_matrix)
  
}
    
likelihood_normal <- function(input_mean, input_sd, x){
      temp = 1
      for ( i in x){
          temp = temp * (1/ sqrt( 2 * pi * input_sd^2)) * exp( - ((i - input_mean) ^ 2) / (2 * input_sd^2))
      }
      return(temp)
}

sample_a <- sampling_function(number_of_samples = M, sample_size = N, input_mean = 1, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_a)){
  x <- sample_a[i,]
  sd_mle <- sqrt(mean((x-mean(x))^2))
  like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
  like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
  like_ratio <- -2 * log(like_null / like_alt)
  lrt_vector <- c(lrt_vector, like_ratio)
  
}

hist(lrt_vector, main = "Problem 3 (a)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")
-2ln values greater than 3.841459 =  489 
pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)

b)

N <- 100
sample_b <- sampling_function(number_of_samples = M, sample_size = N, input_mean = 1, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_b)){
  x <- sample_b[i,]
  sd_mle <- sqrt(mean((x-mean(x))^2))
  like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
  like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
  like_ratio <- -2 * log(like_null / like_alt)
  lrt_vector <- c(lrt_vector, like_ratio)
  
}

hist(lrt_vector, main = "Problem 3 (b)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")
-2ln values greater than 3.841459 =  471 
pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)

c)

N <- 20

sample_c <- sampling_function(number_of_samples = M, sample_size = N, input_mean = pop_mean, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_c)){
  x <- sample_c[i,]
  sd_mle <- sqrt(mean((x-mean(x))^2))
  like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
  like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
  like_ratio <- -2 * log(like_null / like_alt)
  lrt_vector <- c(lrt_vector, like_ratio)
  
}

hist(lrt_vector, main = "Problem 3 (c)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")
-2ln values greater than 3.841459 =  4990 
pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)

d)

N <- 100

sample_d <- sampling_function(number_of_samples = M, sample_size = N, input_mean = pop_mean, input_var = 1.3)

lrt_vector <- c()

for( i in 1:nrow(sample_d)){
  x <- sample_d[i,]
  sd_mle <- sqrt(mean((x-mean(x))^2))
  like_null <- likelihood_normal(input_mean = 1, input_sd = sqrt(1.3), x = x)
  like_alt <- likelihood_normal(input_mean = mean(x), input_sd = sqrt(1.3), x = x)
  like_ratio <- -2 * log(like_null / like_alt)
  lrt_vector <- c(lrt_vector, like_ratio)
  
}

hist(lrt_vector, main = "Problem 3 (d)")

cat("-2ln values greater than 3.841459 = ", sum(lrt_vector > 3.841459), "\n")
-2ln values greater than 3.841459 =  9915 
pvals <- pchisq(lrt_vector, 1, lower.tail = FALSE)

hist(pvals)

Problem 6

phenotypes <- read.table("QG16_phenotypes_midterm.txt", stringsAsFactors = FALSE, row.names = 1)
midterm_genotypes <- read.table("QG16_genotypes_midterm.txt", stringsAsFactors = FALSE)

phenotypes <- as.matrix(phenotypes)

hist(phenotypes, main = "Histogram of Midterm Phenotypes")

outlier_pheno <- which.max(phenotypes)
cat("Outlier phenotypes = ", rownames(phenotypes)[outlier_pheno], "\n")
Outlier phenotypes =  sample_222 
filtered_phenotypes <- phenotypes[-outlier_pheno, ]

filtered_genotypes <- midterm_genotypes[-outlier_pheno,]

hist(filtered_phenotypes, main = "Phenotypes after outlier removal")

Problem 7

cat("Identifying individuals with missing genotype data\n")
Identifying individuals with missing genotype data
which(filtered_genotypes == 9, arr.ind = TRUE)
           row  col
sample_67   67 1167
sample_110 110 1820
sample_180 180 3540
sample_230 229 5100
missing_geno_names <- colnames(filtered_genotypes)[(which(filtered_genotypes == 9, arr.ind = TRUE)[,2])]
missing_geno <- sapply(strsplit(missing_geno_names, "_"), function(x) x[1])

cat("Genotypes with missing data: \n")
Genotypes with missing data: 
print(missing_geno)
[1] "geno584"  "geno910"  "geno1770" "geno2550"
missing_idx <- c()

for( m in missing_geno){
  missing_idx <- c(missing_idx, grep(m, colnames(filtered_genotypes)))
}

no_missing_geno <- as.matrix(filtered_genotypes[,-missing_idx])

Problem 8

a)

library(MASS)

xa_converter <- function(geno_in){
  geno_count <- table(geno_in)
  minor_allele <- names(geno_count[geno_count == min(geno_count)])
  xa_code <- ifelse(geno_in == minor_allele, 1,0)
  xa_result <- rowSums(xa_code) - 1
  return(xa_result)
}

MLE_calculator <- function(pheno_input, xa_input, xd_input){
    n_samples <- length(xa_input)

    X_mx <- cbind(1,xa_input,xd_input)
    
    MLE_beta <- ginv(t(X_mx) %*% X_mx) %*% t(X_mx) %*% pheno_input

    return(MLE_beta)
}

xa_matrix <- matrix(NA, 
                    nrow = nrow(no_missing_geno), 
                    ncol = ncol(no_missing_geno)/2)

colnames(xa_matrix) <- unique(sapply(strsplit(colnames(no_missing_geno), "_"), function(x) x[1]))

for (i in 1:(ncol(no_missing_geno)/2)){
  xa_matrix[,i] <- xa_converter(no_missing_geno[,c(2*i -1, 2*i)])  
}

xd_matrix <- 1 - (2 * abs(xa_matrix))


mle_matrix <- matrix(NA, nrow = ncol(xa_matrix), ncol = 3)

for(k in 1:ncol(xa_matrix)){
  
    mle_matrix[k,] <- MLE_calculator(filtered_phenotypes, xa_matrix[,k], xd_matrix[,k])
  
}

par(mfrow = c(1,3))
hist(mle_matrix[,1], main = "MLE beta_mu")
hist(mle_matrix[,2], main = "MLE beta_a")
hist(mle_matrix[,3], main = "MLE beta_d")