## Quantitative Genomics and Genetics

### Homework 4 Key (Problem 2)

– 12 October 2014

### 2a. Simulating genotypes

set.seed(1000)
#set number of samples
N <- 100

# set number of genotypes
n.geno <- 200

# create a matrix to hold the simulated genotype values
# note that each row is a sample and a column contains 1 allele of a specific genotypes, so the number of columns should be 2 * n.geno
sim.genotypes <- matrix(NA, nrow = N, ncol = 2 * n.geno)

# start for loop to create genotypes
for( i in 1:n.geno){

# I took the approach of choosing 2 nucleotides from all possible 4 nucleotides for a single genotype
nucleotides <- sample(c("a","t","g","c"), 2,replace = FALSE)

# randomly assign a minor allele frequency for that specific genotype which has a minimum value of 0.2 and a maximum of 0.5 ( if its greater than 0.5 it is not the minor allele anymore)
maf <- runif(1,min = 0.2,max =0.5)

# for the number of samples
for (j in 1:N){
# sample a pair of genotypes from the 2 nucleotides selected for this genotype for each sample
# note that replace = TRUE to allow homozygous pairs and prob specifies the MAF
# also note that 2*i-1, 2*i are specifying a pair of columns (ex, 12 34 56 78)
sim.genotypes[j,c(2*i-1,2*i)] <- sample(nucleotides, 2, replace = TRUE, prob = c(maf,1-maf))
}

}

### 2b. Xa and Xd conversion

# declare a matrix to hold the Xa values
# note the dimensions
Xa <- matrix(NA, nrow = N, ncol = n.geno)

# for each genotype
for ( i in 1: n.geno){

# get the minor allele
minor.allele <- names(which.min(table(sim.genotypes[,c(2*i -1, 2*i)])))

# convert all minor alleles to 1 (major alleles will have a value of 0)
temp <- 1 * (sim.genotypes[,c(2*i -1, 2*i)] == minor.allele)

# Count the number of minor alleles in each row (0,1,2) and substract 1 from it to convert it to (-1,0,1)
Xa[,i] <- temp[,1] + temp[,2] - 1

}

# conversion of the Xa matrix to the Xd matrix
Xd <-  1 - 2*abs(Xa)

### 2c. Simulating phenotypes

# We are going to use genotypes 25 and 150 as the significant ones to simulate the phenotype
sig.genotype.mx <- cbind(1, Xa[,25],Xd[,25],Xa[,150],Xd[,150])

# the beta values are given in the problem
beta.vec <- c(beta.mu=1,beta.a1=1.0,beta.d1=0,beta.a2=0.5,beta.d2=0.5)
# the standard deviation for the normaly distributed random error
error.sd <- 1

# creating a 100 x 1 phenotype
phenotypes <- sig.genotype.mx %*% beta.vec + rnorm(100,mean = 0,sd =error.sd)

### 2d.

The true model for genotypes is $$\beta_a = 0$$ and $$\beta_d = 0$$, such that they do not contribute at all to variation in the phenotype (i.e. they are uncorrelated with the phenotype), such that we can simulate the phenotype when only considering the impact of the causal genotypes and there associate parameter values.

### 2e. MLE calculations

# create a matrix to hold the MLE values for each genotype

MLE.mx=matrix(NA,nrow=3,ncol=n.geno)

# Loop over every genotype to calculate the MLE values
for (i in 1:n.geno){
X.mx=cbind(1, Xa[,i],Xd[,i])
MLE.mx[,i]=solve(t(X.mx)%*%X.mx)%*%t(X.mx)%*%phenotypes
}

par(mfrow = c(1,3))
plot(1:dim(MLE.mx)[2],MLE.mx[1,],main = "MLE(beta.mu) for every Genotype",xlab ="genotype idx",ylab="MLE(beta.mu)")
plot(1:dim(MLE.mx)[2],MLE.mx[2,],main = "MLE(beta.a) for every Genotype",xlab ="genotype idx",ylab="MLE(beta.a)")
plot(1:dim(MLE.mx)[2],MLE.mx[3,],main = "MLE(beta.d) for every Genotype",xlab ="genotype idx",ylab="MLE(beta.d)")

### 2f. F-statistic calculation and plotting

fstat <- vector(length = n.geno)
for (i in 1 : n.geno){

N.samples <- N
X.mx <- cbind(1,Xa[,i],Xd[,i])

MLE.beta <- solve(t(X.mx) %*% X.mx) %*% t(X.mx) %*% phenotypes
y.hat <- X.mx %*% MLE.beta

SSM <- sum((y.hat - mean(phenotypes))^2)
SSE <- sum((phenotypes - y.hat)^2)

df.M <- 2
df.E <- N - 3

MSM <- SSM / df.M
MSE <- SSE / df.E

fstat[i] <- MSM / MSE

}

hist(fstat, breaks = 20, main = "Histogram of F statistics for each genotype")

### 2g. P-value calculation and manhattan plot generation

# vectorized p-value calculation (no need for a loop)
pvals <- pf(fstat, df.M, df.E, lower.tail = FALSE)

# plotting the manhattan plot
plot(1:n.geno,-log10(pvals),xlab ="",ylab="-Log10(P-values)",main = "Manhattan Plot", col = "blue", pch =19)

### 2h.

These Manhattan plots have individual genotypes that are highly significant where none of the genotypes around them are close to the same significance level. This means that there is no linkage disequilibrium (or at least disequilibrium among genotypes close to each other) that you simulated. (NOTE that if you simulated markers that were in LD, your answer should be slightly different - and also fine!).

### 2i. Null hyptothesis rejection

Yes we would reject for the two significant cases at a type I error of 0.05. You should have had around 10 (maybe a few more or less) that were significant at 0.05. It is not surprising that we had additional genotypes that were significant (=false positives!) since setting a type 1 error of 0.05 means that we expect 1 out of 20 tests where the null hypothesis is correct will result in the (incorrect!) rejection of the null hypothesis. Since 200 * 0.05 = 10, we expect to incorrectly to reject the null for 10 of the markers.

cat(paste("P-value for genotype # 25 =",pvals[25]))
P-value for genotype # 25 = 7.9617877954859e-09
if(pvals[25] < 0.05) {
cat("Null is rejected \n")
}
Null is rejected 
cat(paste("P-value for genotype # 150 =",pvals[150]))
P-value for genotype # 150 = 8.25969937802909e-06
if(pvals[150] < 0.05) {
cat("Null is rejected \n")
}
Null is rejected 
cat(paste("The total number of cases where we reject the null hypothesis with a threshold of 0.05 = ",sum(pvals<0.05)))
The total number of cases where we reject the null hypothesis with a threshold of 0.05 =  15

### 2j. X-Y plots

par(mfrow = c(2,2))

plot(Xa[,25],phenotypes,xlab="Genotype = Xa[,25]",ylab="Phenotype")
plot(Xd[,25],phenotypes,xlab="Genotype = Xd[,25]",ylab="Phenotype")
plot(Xa[,150],phenotypes,xlab="Genotype = Xa[,150]",ylab="Phenotype")
plot(Xd[,150],phenotypes,xlab="Genotype = Xd[,150]",ylab="Phenotype")