These exercises cover the sections of Statistics in R Introduction to R.

Exercise 1

 catAndExpr <- read.delim("../data/categoriesAndExpression.txt")
 summary(catAndExpr)
##     geneName        ofInterest       pathway     Expression   
##  Gene1  : 1   NotSelected:80   Glycolysis:55   Min.   :19.94  
##  Gene10 : 1   Selected   :20   TGFb      :45   1st Qu.:22.99  
##  Gene100: 1                                    Median :43.03  
##  Gene11 : 1                                    Mean   :40.78  
##  Gene12 : 1                                    3rd Qu.:55.32  
##  Gene13 : 1                                    Max.   :74.08  
##  (Other):94
 summary(catAndExpr)
##     geneName        ofInterest       pathway     Expression   
##  Gene1  : 1   NotSelected:80   Glycolysis:55   Min.   :19.94  
##  Gene10 : 1   Selected   :20   TGFb      :45   1st Qu.:22.99  
##  Gene100: 1                                    Median :43.03  
##  Gene11 : 1                                    Mean   :40.78  
##  Gene12 : 1                                    3rd Qu.:55.32  
##  Gene13 : 1                                    Max.   :74.08  
##  (Other):94
 summary(catAndExpr[catAndExpr$pathway == "Glycolysis",])
##     geneName        ofInterest       pathway     Expression   
##  Gene1  : 1   NotSelected:40   Glycolysis:55   Min.   :19.94  
##  Gene10 : 1   Selected   :15   TGFb      : 0   1st Qu.:21.01  
##  Gene11 : 1                                    Median :23.00  
##  Gene12 : 1                                    Mean   :29.78  
##  Gene13 : 1                                    3rd Qu.:27.06  
##  Gene14 : 1                                    Max.   :74.08  
##  (Other):49
 summary(catAndExpr[catAndExpr$pathway == "TGFb",])
##     geneName        ofInterest       pathway     Expression   
##  Gene100: 1   NotSelected:40   Glycolysis: 0   Min.   :42.91  
##  Gene16 : 1   Selected   : 5   TGFb      :45   1st Qu.:50.09  
##  Gene17 : 1                                    Median :54.04  
##  Gene18 : 1                                    Mean   :54.24  
##  Gene19 : 1                                    3rd Qu.:58.06  
##  Gene20 : 1                                    Max.   :66.10  
##  (Other):39
contingency <- ftable(catAndExpr[,c("ofInterest","pathway")])
contingency
##             pathway Glycolysis TGFb
## ofInterest                         
## NotSelected                 40   40
## Selected                    15    5
#fisher.test(contingency)
meanExpression <- mean(catAndExpr$Expr)
sdExpression <- sd(catAndExpr$Expr)
Gene13Expression <- catAndExpr[catAndExpr$geneName == "Gene13",]$Expression
1-pnorm(Gene13Expression,meanExpression,sdExpression)
## [1] 0.0254759
glycolysisExpression <- catAndExpr[catAndExpr$pathway == "Glycolysis",]$Expression
tgfbExpression <- catAndExpr[catAndExpr$pathway == "TGFb",]$Expression
var.test(glycolysisExpression,tgfbExpression)
## 
##  F test to compare two variances
## 
## data:  glycolysisExpression and tgfbExpression
## F = 7.5607, num df = 54, denom df = 44, p-value = 2.375e-10
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##   4.241664 13.253432
## sample estimates:
## ratio of variances 
##           7.560727
t.test(glycolysisExpression,tgfbExpression,var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  glycolysisExpression and tgfbExpression
## t = -10.9975, df = 70.605, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -28.89282 -20.02306
## sample estimates:
## mean of x mean of y 
##  29.77779  54.23573

Exercise 2

 lmExercise <- read.delim("../data/lmExercise.txt")
 lmXY <- lm(x~y,lmExercise)
 lmXZ <- lm(x~z,lmExercise)
lmXY
## 
## Call:
## lm(formula = x ~ y, data = lmExercise)
## 
## Coefficients:
## (Intercept)            y  
##    -10.7659       0.2461
 plot(x~y,lmExercise)
 abline(lmXY,col="red")

lmXZ
## 
## Call:
## lm(formula = x ~ z, data = lmExercise)
## 
## Coefficients:
## (Intercept)            z  
##      6.8754      -0.4654
plot(x~z,lmExercise)
 abline(lmXZ,col="red")

str(summary(lmXY))
## List of 11
##  $ call         : language lm(formula = x ~ y, data = lmExercise)
##  $ terms        :Classes 'terms', 'formula' length 3 x ~ y
##   .. ..- attr(*, "variables")= language list(x, y)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "x" "y"
##   .. .. .. ..$ : chr "y"
##   .. ..- attr(*, "term.labels")= chr "y"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(x, y)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "x" "y"
##  $ residuals    : Named num [1:1000] -0.00382 -0.76583 -0.57636 0.97523 -0.36146 ...
##   ..- attr(*, "names")= chr [1:1000] "1" "2" "3" "4" ...
##  $ coefficients : num [1:2, 1:4] -10.76587 0.24614 0.49211 0.00109 -21.8769 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "(Intercept)" "y"
##   .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
##  $ aliased      : Named logi [1:2] FALSE FALSE
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "y"
##  $ sigma        : num 0.572
##  $ df           : int [1:3] 2 998 2
##  $ r.squared    : num 0.981
##  $ adj.r.squared: num 0.981
##  $ fstatistic   : Named num [1:3] 50752 1 998
##   ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
##  $ cov.unscaled : num [1:2, 1:2] 7.41e-01 -1.64e-03 -1.64e-03 3.65e-06
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "(Intercept)" "y"
##   .. ..$ : chr [1:2] "(Intercept)" "y"
##  - attr(*, "class")= chr "summary.lm"
summary(lmXY)$r.squared
## [1] 0.9807148
summary(lmXZ)$r.squared
## [1] 0.9222796
str(lmXY)
## List of 12
##  $ coefficients : Named num [1:2] -10.766 0.246
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "y"
##  $ residuals    : Named num [1:1000] -0.00382 -0.76583 -0.57636 0.97523 -0.36146 ...
##   ..- attr(*, "names")= chr [1:1000] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:1000] -3162.995 128.79 -0.618 0.983 -0.372 ...
##   ..- attr(*, "names")= chr [1:1000] "(Intercept)" "y" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:1000] 104 96.7 107.4 98.8 102 ...
##   ..- attr(*, "names")= chr [1:1000] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:1000, 1:2] -31.6228 0.0316 0.0316 0.0316 0.0316 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:1000] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "y"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.03 1.03
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 998
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = x ~ y, data = lmExercise)
##  $ terms        :Classes 'terms', 'formula' length 3 x ~ y
##   .. ..- attr(*, "variables")= language list(x, y)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "x" "y"
##   .. .. .. ..$ : chr "y"
##   .. ..- attr(*, "term.labels")= chr "y"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(x, y)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "x" "y"
##  $ model        :'data.frame':   1000 obs. of  2 variables:
##   ..$ x: num [1:1000] 104 95.9 106.8 99.8 101.6 ...
##   ..$ y: num [1:1000] 466 436 480 445 458 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula' length 3 x ~ y
##   .. .. ..- attr(*, "variables")= language list(x, y)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "x" "y"
##   .. .. .. .. ..$ : chr "y"
##   .. .. ..- attr(*, "term.labels")= chr "y"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(x, y)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "x" "y"
##  - attr(*, "class")= chr "lm"
predictedXfromY <- unname(lmXY$coefficients[1] + lmXY$coefficients[2]*100)
predictedXfromZ <- unname(lmXZ$coefficients[1] + lmXZ$coefficients[2]*100)
predictedXfromY
## [1] 13.84769
predictedXfromZ
## [1] -39.66735