Quantitative Genomics and Genetics

Computer Lab 6

– 2 October 2014

– Author: Jin Hyun Ju (jj328@cornell.edu)


Simulating genotypes


nucleotides <- sample(c("a","t","g","c"), 2,replace = FALSE)
N <- 10

sim.genotypes <- matrix(NA, nrow = N, ncol = 2)

for( i in 1:N){
  sim.genotypes[i,] <- sample(nucleotides, 2, replace = TRUE)
}

Exercise 1

  • Note that this would create genotypes with a minor allele frequency (MAF) of 0.5 since that is the default. So we would like to change that! The first task for you would be to create genotypes with MAF = 0.3 for 10 samples.

  • Now from the created sample calculate the MAF and see if it is close to the value that we have set.

  • The table() function might come in very handy.

  • Now that we have a simulated set of genotypes, we can code them into the dummy variables Xa and Xd that you have seen in class.

  • Xa takes values -1,0,1 and Xd takes -1 or 1

plot of chunk unnamed-chunk-6

  • For Xa conversion we could do something like this
sim.genotypes
      [,1] [,2]
 [1,] "c"  "c" 
 [2,] "g"  "g" 
 [3,] "g"  "g" 
 [4,] "c"  "c" 
 [5,] "g"  "c" 
 [6,] "c"  "c" 
 [7,] "g"  "g" 
 [8,] "c"  "g" 
 [9,] "c"  "g" 
[10,] "c"  "c" 
minor.allele <- names(which.min(table(sim.genotypes)))

temp <- 1 * (sim.genotypes == minor.allele)
Xa <- temp[,1] + temp[,2] - 1 
 
# Checking if the conversion is correct
cbind(sim.genotypes, Xa)
              Xa  
 [1,] "c" "c" "-1"
 [2,] "g" "g" "1" 
 [3,] "g" "g" "1" 
 [4,] "c" "c" "-1"
 [5,] "g" "c" "0" 
 [6,] "c" "c" "-1"
 [7,] "g" "g" "1" 
 [8,] "c" "g" "0" 
 [9,] "c" "g" "0" 
[10,] "c" "c" "-1"

Exercise 2.

  • Create an Xd matrix for the simulated genotypes.

Simulating Phenotypes


  • Now that we have an Xa and Xd matrix we can simulate phenotypes as well

  • Matrix calculations will make this process a lot easier.

Exercise 3

  • Lets say for now that we have \(\beta_{\mu} = 0.3\), \(\beta_a = 1.5\), \(\beta_d = 0.05\) and \(\epsilon \sim N(0,1)\). Using these parameters simulate phenotypes following the model: \(Y = \beta_{\mu} + X_a \beta_a + X_d \beta_d + \epsilon\).

  • Here are some equations that might come in handy for this task.

plot of chunk unnamed-chunk-9

# Note that you have to add a column with 1's to the Xa and Xd matrix for the multiplication of beta_mu
X.mx <- cbind(1, Xa,Xd)
  • After the simulation the results should look something like this :
par(mfrow = c(1,2))
plot(Xa, phenotypes, main = "Xa vs Phenotypes", pch = 19, col = "blue")
plot(Xd, phenotypes, main = "Xd vs Phenotypes", pch = 19, col = "blue")

plot of chunk unnamed-chunk-12

Exercise 4.

  • Using the equations given above, calculate the F statistic for the simulated data

  • As a last step, use the F-statistic and the corresponding degrees of freedom to calculate a p value using the pf() function with option lower.tail = FALSE.

# The pseudo code for this task

MLE.beta <- calculate MLE.beta 
y.hat <- calculate the estimated values of y given the MLE>beta values

SSM <- calculate SSM
SSE <- calculate SSE

df.M
df.E

MSM <- calculate MSM
MSE <- calculate MSE

Fstatistic <- MSM / MSE

pf(Fstatistic, df.M,df.E,lower.tail =FALSE)

Exercise 5

  • Now that we have all the essentials worked out, we can simulate genotypes for multiple samples and generate a manhattan plot from it.

  • You can review scatter plots from last weeks lab to create manhattan plots.

N <- Set number of samples
n.geno <- Set number of genotypes
sim.genotypes <- declare matrix with appropriate dimensions (N x (2*n.geno)) 

# for the number of genotypes
for( i in 1:n.geno){
  nucleotides <- sample two nucleotides for the position
  maf <- get a random maf from a uniform distribution (note that 0 < maf =< 0.5)
  
  # for j in 1 to number of samples
  for (j in 1:N){

    sim.genotypes[j,c(2*i-1,2*i)] <- Generate two genotypes for each sample and use the maf as probability
  }
  
}


Xa <- create a place holding matrix for Xa values


For all genotypes
for ( i in 1: n.geno){
  
  # get minor allele
  minor.allele <- names(which.min(table(sim.genotypes[,c(2*i -1, 2*i)])))

  temp <- 1 * (sim.genotypes[,c(2*i -1, 2*i)] == minor.allele)
  Xa[,i] <- temp[,1] + temp[,2] - 1 
  
}

Xd <-  convert Xa into Xd


# Lets make genotype # 20 the significant one

X.mx <- cbind(1, Xa[,20],Xd[,20])
beta.vec <- c(0.3,0.8,0.05)
phenotypes <- X.mx %*% beta.vec + rnorm(10,0,1)


pval <- define vector to save p-values

for (i in 1 : n.geno){
  
  single.Xa <- get single Xa 
  single.Xd <- get single Xd
  
  X.mx <- define X.mx using single Xa and single Xd
  
  MLE.beta <- calculate MLE.beta 
  y.hat <- calculate the estimated values of y given the MLE>beta values

  SSM <- calculate SSM
  SSE <- calculate SSE

  df.M
  df.E

  MSM <- calculate MSM
  MSE <- calculate MSE

  Fstatistic <- MSM / MSE

  pval[i] <- pf(Fstatistic, df.M,df.E,lower.tail =FALSE)

}
  • Your Manhattan plot should look something like this:

plot of chunk unnamed-chunk-17