These exercises cover the sections of Statistics in R Introduction to R.
Exercise 1
catAndExpr <- read.delim("../data/categoriesAndExpression.txt")
summary(catAndExpr)
## geneName ofInterest pathway Expression
## Gene1 : 1 NotSelected:80 Glycolysis:55 Min. :19.94
## Gene10 : 1 Selected :20 TGFb :45 1st Qu.:22.99
## Gene100: 1 Median :43.03
## Gene11 : 1 Mean :40.78
## Gene12 : 1 3rd Qu.:55.32
## Gene13 : 1 Max. :74.08
## (Other):94
summary(catAndExpr)
## geneName ofInterest pathway Expression
## Gene1 : 1 NotSelected:80 Glycolysis:55 Min. :19.94
## Gene10 : 1 Selected :20 TGFb :45 1st Qu.:22.99
## Gene100: 1 Median :43.03
## Gene11 : 1 Mean :40.78
## Gene12 : 1 3rd Qu.:55.32
## Gene13 : 1 Max. :74.08
## (Other):94
summary(catAndExpr[catAndExpr$pathway == "Glycolysis",])
## geneName ofInterest pathway Expression
## Gene1 : 1 NotSelected:40 Glycolysis:55 Min. :19.94
## Gene10 : 1 Selected :15 TGFb : 0 1st Qu.:21.01
## Gene11 : 1 Median :23.00
## Gene12 : 1 Mean :29.78
## Gene13 : 1 3rd Qu.:27.06
## Gene14 : 1 Max. :74.08
## (Other):49
summary(catAndExpr[catAndExpr$pathway == "TGFb",])
## geneName ofInterest pathway Expression
## Gene100: 1 NotSelected:40 Glycolysis: 0 Min. :42.91
## Gene16 : 1 Selected : 5 TGFb :45 1st Qu.:50.09
## Gene17 : 1 Median :54.04
## Gene18 : 1 Mean :54.24
## Gene19 : 1 3rd Qu.:58.06
## Gene20 : 1 Max. :66.10
## (Other):39
contingency <- ftable(catAndExpr[,c("ofInterest","pathway")])
contingency
## pathway Glycolysis TGFb
## ofInterest
## NotSelected 40 40
## Selected 15 5
#fisher.test(contingency)
meanExpression <- mean(catAndExpr$Expr)
sdExpression <- sd(catAndExpr$Expr)
Gene13Expression <- catAndExpr[catAndExpr$geneName == "Gene13",]$Expression
1-pnorm(Gene13Expression,meanExpression,sdExpression)
## [1] 0.0254759
glycolysisExpression <- catAndExpr[catAndExpr$pathway == "Glycolysis",]$Expression
tgfbExpression <- catAndExpr[catAndExpr$pathway == "TGFb",]$Expression
var.test(glycolysisExpression,tgfbExpression)
##
## F test to compare two variances
##
## data: glycolysisExpression and tgfbExpression
## F = 7.5607, num df = 54, denom df = 44, p-value = 2.375e-10
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 4.241664 13.253432
## sample estimates:
## ratio of variances
## 7.560727
t.test(glycolysisExpression,tgfbExpression,var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: glycolysisExpression and tgfbExpression
## t = -10.9975, df = 70.605, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.89282 -20.02306
## sample estimates:
## mean of x mean of y
## 29.77779 54.23573
Exercise 2
lmExercise <- read.delim("../data/lmExercise.txt")
lmXY <- lm(x~y,lmExercise)
lmXZ <- lm(x~z,lmExercise)
lmXY
##
## Call:
## lm(formula = x ~ y, data = lmExercise)
##
## Coefficients:
## (Intercept) y
## -10.7659 0.2461
plot(x~y,lmExercise)
abline(lmXY,col="red")
lmXZ
##
## Call:
## lm(formula = x ~ z, data = lmExercise)
##
## Coefficients:
## (Intercept) z
## 6.8754 -0.4654
plot(x~z,lmExercise)
abline(lmXZ,col="red")
str(summary(lmXY))
## List of 11
## $ call : language lm(formula = x ~ y, data = lmExercise)
## $ terms :Classes 'terms', 'formula' length 3 x ~ y
## .. ..- attr(*, "variables")= language list(x, y)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "x" "y"
## .. .. .. ..$ : chr "y"
## .. ..- attr(*, "term.labels")= chr "y"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(x, y)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "x" "y"
## $ residuals : Named num [1:1000] -0.00382 -0.76583 -0.57636 0.97523 -0.36146 ...
## ..- attr(*, "names")= chr [1:1000] "1" "2" "3" "4" ...
## $ coefficients : num [1:2, 1:4] -10.76587 0.24614 0.49211 0.00109 -21.8769 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "y"
## .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
## $ aliased : Named logi [1:2] FALSE FALSE
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "y"
## $ sigma : num 0.572
## $ df : int [1:3] 2 998 2
## $ r.squared : num 0.981
## $ adj.r.squared: num 0.981
## $ fstatistic : Named num [1:3] 50752 1 998
## ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
## $ cov.unscaled : num [1:2, 1:2] 7.41e-01 -1.64e-03 -1.64e-03 3.65e-06
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "y"
## .. ..$ : chr [1:2] "(Intercept)" "y"
## - attr(*, "class")= chr "summary.lm"
summary(lmXY)$r.squared
## [1] 0.9807148
summary(lmXZ)$r.squared
## [1] 0.9222796
str(lmXY)
## List of 12
## $ coefficients : Named num [1:2] -10.766 0.246
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "y"
## $ residuals : Named num [1:1000] -0.00382 -0.76583 -0.57636 0.97523 -0.36146 ...
## ..- attr(*, "names")= chr [1:1000] "1" "2" "3" "4" ...
## $ effects : Named num [1:1000] -3162.995 128.79 -0.618 0.983 -0.372 ...
## ..- attr(*, "names")= chr [1:1000] "(Intercept)" "y" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:1000] 104 96.7 107.4 98.8 102 ...
## ..- attr(*, "names")= chr [1:1000] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:1000, 1:2] -31.6228 0.0316 0.0316 0.0316 0.0316 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:1000] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "y"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.03 1.03
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 998
## $ xlevels : Named list()
## $ call : language lm(formula = x ~ y, data = lmExercise)
## $ terms :Classes 'terms', 'formula' length 3 x ~ y
## .. ..- attr(*, "variables")= language list(x, y)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "x" "y"
## .. .. .. ..$ : chr "y"
## .. ..- attr(*, "term.labels")= chr "y"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(x, y)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "x" "y"
## $ model :'data.frame': 1000 obs. of 2 variables:
## ..$ x: num [1:1000] 104 95.9 106.8 99.8 101.6 ...
## ..$ y: num [1:1000] 466 436 480 445 458 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' length 3 x ~ y
## .. .. ..- attr(*, "variables")= language list(x, y)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "x" "y"
## .. .. .. .. ..$ : chr "y"
## .. .. ..- attr(*, "term.labels")= chr "y"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(x, y)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "x" "y"
## - attr(*, "class")= chr "lm"
predictedXfromY <- unname(lmXY$coefficients[1] + lmXY$coefficients[2]*100)
predictedXfromZ <- unname(lmXZ$coefficients[1] + lmXZ$coefficients[2]*100)
predictedXfromY
## [1] 13.84769
predictedXfromZ
## [1] -39.66735