Quantitative Genomics and Genetics 2016

Final R-code Key


– 23 May 2016

– Author: Jin Hyun Ju (jj328@cornell.edu)

Problem 1

a)

pheno_norm <- read.csv("QG16_Final_Y_normal.csv", row.names =1)

hist(pheno_norm$normal_pheno, main = "histogram of normal phenotype")

c)

pheno_cat <- read.csv("QG16_Final_Y_categorical.csv", row.names =1)

hist(pheno_cat$binary_pheno, main = "histogram of categorical phenotype")

Problem 2

a)

genotypes <- read.csv("QG16_Final_Genotypes.csv", row.names =1)

maf_calc <- function(single_geno){
    n_geno <- length(single_geno)
    allele_count <- sum(single_geno)
    allele_freq <- allele_count / (2*n_geno)
    if(allele_freq > 0.5){
        allele_freq <- 1- allele_freq  
    }
    return(allele_freq)
}

maf_vec = apply(genotypes,2, maf_calc)

hist(maf_vec, main = "MAF of all genotypes")

b)

geno_pca <- prcomp(genotypes)

plot(geno_pca$x[,1], geno_pca$x[,2], main = "Genotype PC projections", xlab = "PC1", ylab = "PC2")

Problem 3

a)

xa_matrix <- genotypes - 1
xd_matrix <- 1 - 2*abs(xa_matrix)

library(MASS)

pval_calculator <- function(pheno_input, xa_input, xd_input){
    n_samples <- length(xa_input)

    X_mx <- cbind(1,xa_input,xd_input)

    MLE_beta <- ginv(t(X_mx) %*% X_mx) %*% t(X_mx) %*% pheno_input
    y_hat <- X_mx %*% MLE_beta

    SSM <- sum((y_hat - mean(pheno_input))^2)
    SSE <- sum((pheno_input - y_hat)^2)

    df_M <- 2
    df_E <- n_samples - 3 

    MSM <- SSM / df_M
    MSE <- SSE / df_E

    Fstatistic <- MSM / MSE

    # to check if it is correct 
    pval <- pf(Fstatistic, df_M, df_E,lower.tail = FALSE)

    return(pval)
}

n_geno <- ncol(xa_matrix)

pval_vec <-vector(length =n_geno)

for (i in 1 : n_geno){

    #cat("Testing Gentoype = ", i, "\n")      
    pval_vec[i] <- pval_calculator(pheno_norm$normal_pheno, xa_input = xa_matrix[,i], xd_input = xd_matrix[,i])

}

b)

plot(1:n_geno, -log10(pval_vec),main = "Problem 3 manhattan plot") 

c)

sorted_pvals <- sort(pval_vec, decreasing = FALSE)
sorted_expected <- sort(runif(n_geno), decreasing = FALSE)

plot(-log10(sorted_expected), -log10(sorted_pvals), main = "Problem 3 QQplot")
abline(a = 0, b = 1, col = "red")