## Quantitative Genomics and Genetics

### Computer Lab 2

– 4 September 2014

– Author: Jin Hyun Ju (jj328@cornell.edu)

### 1. Review: Don’t forget to set your working directory

getwd() # check current working directory 
[1] "/Users/Jin/Dropbox/Quantitative_Genomics_2014/Computer_lab_2"
setwd("~/Dropbox/Quantitative_Genomics_2014/Computer_lab_2/") # On Windows use forward slashes "C:/Dropbox/..." or escaped backslashes "C:\\Dropbox\\..."

dir()    # check what is in your directory
[1] "QG13-lab2.pdf"            "QG14_Computer_Lab_2.html"
[3] "QG14_Computer_Lab_2.Rmd"  "QG14_subset_only_a.csv"  
dir.create("~/Dropbox/Quantitative_Genomics_2014/Computer_lab_2/test_dir") # In case you forgot to create a directory

dir()
[1] "QG13-lab2.pdf"            "QG14_Computer_Lab_2.html"
[3] "QG14_Computer_Lab_2.Rmd"  "QG14_subset_only_a.csv"
[5] "test_dir"                

### 2. Review: Reading and writing data

# Reading data from a csv file.
# header = TRUE is spelled out: T is an ordinary variable that can be reassigned.
QG14.lab.2 <- read.table(
  "~/Dropbox/Quantitative_Genomics_2014/Computer_lab_1/QG14-lab1-data.csv",
  sep = ",",
  header = TRUE
)

# Subsetting a data frame: keep only the rows where factor1 equals "a"
QG14.lab.2.only.a <- subset(QG14.lab.2, factor1 == "a")

# Writing the subset to a csv file in the current working directory
write.table(
  QG14.lab.2.only.a,
  file = "./QG14_subset_only_a.csv",
  sep = ",",
  quote = FALSE,
  row.names = FALSE
)
# quote = FALSE removes the "" around the entries. Try it with quote = TRUE and see how it is different.
# row.names = FALSE eliminates the numbers in front of each row

### 3. Review: Vectors, Matrices, and Dataframes

# Declaring vectors
example.vector1 <- seq(from=1, to = 12, by = 3)

# Declaring matrices
example.matrix1 <- matrix(1:6,nrow=2)
example.matrix2 <- matrix(1:6,ncol = 2)
example.matrix3 <- matrix(1:6, nrow=2,ncol=3)
example.matrix4 <- matrix(1:6, nrow=2,ncol=3, byrow=TRUE)

# Checking the length of a vector
length(example.vector1)
[1] 4
# Accessing specific positions of a vector
example.vector1[2]
[1] 4
# Checking the dimensions of a matrix
dim(example.matrix4)
[1] 2 3
# Naming the rows and columns of a matrix
rownames(example.matrix4) <- c("row1","row2")
colnames(example.matrix4) <- c("column1","column2","column3")

# Accessing specific positions of a matrix
example.matrix4[2,1]
[1] 4
# Getting rows and columns of a matrix
example.matrix4[1,]
column1 column2 column3
1       2       3 
example.matrix4[,2]
row1 row2
2    5 
example.matrix4["row1",]
column1 column2 column3
1       2       3 
example.matrix4[,"column2"]
row1 row2
2    5 
#transposing a matrix
t(example.matrix4) 
        row1 row2
column1    1    4
column2    2    5
column3    3    6
example.matrix4
     column1 column2 column3
row1       1       2       3
row2       4       5       6
# Dataframes
numbers <- c(1:4)
characters <- c("a","b","c","d")

example.data.frame <- data.frame("col1"=numbers,"col2"=characters)
example.data.frame
  col1 col2
1    1    a
2    2    b
3    3    c
4    4    d
class(example.data.frame[,1])
[1] "integer"
class(example.data.frame[,2]) # "factor" in R < 4.0.0; since R 4.0.0 data.frame() keeps strings as "character" by default
[1] "factor"
example.data.frame[1,1]
[1] 1
example.data.frame[2,]
  col1 col2
2    2    b
example.data.frame$col1
[1] 1 2 3 4
# Directly converting matrices to data frames
converted.data.frame <- as.data.frame(example.matrix4)
converted.data.frame
     column1 column2 column3
row1       1       2       3
row2       4       5       6

### 4. Concept of functions

• Every expression in R with parentheses “( )” is a function
• Just like we learned in this week's lecture, a function takes in an input and gives you an output
• The input is the part between “( )”, and the output is the result of running the function
# Examples of built in functions
mean(example.vector1) # a function that calculates the mean
[1] 5.5
table(QG14.lab.2$factor1) # a function that summarizes the counts

a  b
50 50 
• We can also build custom functions
# log10_add: prints its two inputs, adds their base-10 logarithms,
# prints the sum and returns it.
#   input1, input2: the values passed to log10()
#   returns: log10(input1) + log10(input2)
log10_add <- function(input1, input2) {   # the syntax for declaring functions, note the {} after function()
  # all the inputs are specified within the ( )
  cat("This is a custom function \n")
  cat("The inputs are = ", input1, input2, "\n")  # showing you the inputs
  output <- log10(input1) + log10(input2)         # creating an output within the function (local to the function)
  cat("The output is = ", output, "\n")           # print the output
  return(output)                                  # return specifies the output
}

# Now we can call our custom functions like this
log10_add(100,1000)
This is a custom function
The inputs are =  100 1000
The output is =  5 
[1] 5
# Note that the variable output is not created in our workspace
ls()
 [1] "characters"           "converted.data.frame" "example.data.frame"
[4] "example.matrix1"      "example.matrix2"      "example.matrix3"
[13] "QG14.lab.2.only.a"   
# in order to save the result of a function to a variable we have to assign it to a variable
test.output <- log10_add(100,1000)
This is a custom function
The inputs are =  100 1000
The output is =  5 
test.output
[1] 5
• We can also load functions from published packages if somebody else did the hard work for us.
# We can install packages that are on CRAN by using this function

#install.packages("dplyr") # in this case we are installing a package called "dplyr"
# I had to comment this line out

# you will need an internet connection to install a package using this function

# If we try to use a function before loading the package
# R will complain that there is no such function

#mutate(QG14.lab.2,mean = (data1+data2+data3)/3)
# I had to comment this part out due to errors

# Once the installation is finished you can load the functions into the workspace
library(dplyr)

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

filter, lag

The following objects are masked from 'package:base':

intersect, setdiff, setequal, union
# or by
require(dplyr)

# Now we can use the functions from dplyr
QG14.lab.2 <- mutate(QG14.lab.2,mean = (data1+data2+data3)/3)
head(QG14.lab.2)
  genename    data1    data2   data3 factor1 factor2     mean
1    gene1  1.42866 -0.15785  1.3136       a   info1  0.86148
2    gene2 -0.58165  0.59400 -0.2319       b   info2 -0.07319
3    gene3 -1.03956  1.08386  0.7051       a   info3  0.24979
4    gene4  0.58382 -0.12587  1.2628       b   info4  0.57358
5    gene5  0.04377 -0.00224  0.1445       a   info5  0.06202
6    gene6  0.26733 -1.75629  0.5459       b   info6 -0.31435
filter(QG14.lab.2,factor1 == "b" & factor2 =="info8") # Note the & operator which is an AND operator
   genename   data1   data2   data3 factor1 factor2     mean
1     gene8  0.5110  1.3690  0.9549       b   info8  0.94495
2    gene18  1.0240  0.1331  0.9028       b   info8  0.68661
3    gene28  0.8517 -1.0032 -1.2987       b   info8 -0.48339
4    gene38  0.2831 -0.6795  1.8192       b   info8  0.47427
5    gene48 -1.1049  0.5410  0.3203       b   info8 -0.08124
6    gene58  0.7904  0.5097 -1.6116       b   info8 -0.10384
7    gene68 -1.4592  1.5071  0.1016       b   info8  0.04985
8    gene78  1.1133  0.7143 -0.2153       b   info8  0.53743
9    gene88 -1.0000  1.2684 -0.9385       b   info8 -0.22333
10   gene98  1.4705  0.8852  1.2724       b   info8  1.20937

### 5. Vector and Matrix calculations

• If you want to modify each element of a vector by a scalar value, you can use the math operations that we learned last week.
example.vector1
[1]  1  4  7 10
2 * example.vector1
[1]  2  8 14 20
1+ example.vector1
[1]  2  5  8 11
example.vector1 ^2
[1]   1  16  49 100
• If you are interested in the dot product of two vectors you have to use a special operator
example.vector1 %*% example.vector1
     [,1]
[1,]  166
• Element-wise operations with a scalar work the same way for matrices
2 * example.matrix1
     [,1] [,2] [,3]
[1,]    2    6   10
[2,]    4    8   12
example.matrix1 ^2
     [,1] [,2] [,3]
[1,]    1    9   25
[2,]    4   16   36
example.matrix1 - 1
     [,1] [,2] [,3]
[1,]    0    2    4
[2,]    1    3    5
• Here is how you can do matrix calculations
# t() is transposing the matrix
example.matrix1 %*% t(example.matrix1)
     [,1] [,2]
[1,]   35   44
[2,]   44   56
# Note the dimensions 2 x 3 %*% 3 x 2  = 2 x 2 
• Here are some useful functions that can be used in matrix calculations
# creating a diagonal matrix with the first input as values on the diagonal
diag(2,nrow = 3)
     [,1] [,2] [,3]
[1,]    2    0    0
[2,]    0    2    0
[3,]    0    0    2
diag(example.vector1)
     [,1] [,2] [,3] [,4]
[1,]    1    0    0    0
[2,]    0    4    0    0
[3,]    0    0    7    0
[4,]    0    0    0   10
# calculating the inverse of a matrix
A <- matrix(c(2,-3,1,0.5),nrow = 2)
solve(A)
      [,1]  [,2]
[1,] 0.125 -0.25
[2,] 0.750  0.50
# we can check this by
A %*% solve(A) # which results in an identity matrix 
     [,1] [,2]
[1,]    1    0
[2,]    0    1