The ASTA team
## Girth Height Volume
## 1 8.3 70 10
## 2 8.6 65 10
## 3 8.8 63 10
## 4 10.5 72 16
## 5 10.7 81 19
## 6 10.8 83 20
## [1] 0.94
\[ \operatorname{Volume} = \alpha + \beta_{1}(\operatorname{Girth}) + \epsilon \]
\[ \operatorname{\widehat{Volume}} = -36.94 + 5.07(\operatorname{Girth}) \]
\(R^2\) and correlation:
## [1] 0.94
## [1] 0.96
## [1] 0.98
## [1] 0.94
## [1] 0.96
## [1] 0.98
Mean squared error (MSE):
## [1] 17
## [1] 10
## [1] 4.9
## [1] 1.847 0.044 0.801
## [1] -1.3 1.3 -1.5
## [1] 0.46 -0.71 -0.70
## [1] -0.63 0.18 -0.84
## [1] -0.63 0.18 -0.84
## [1] 31
set.seed(1)
train_idx <- sample(x = seq_len(nrow(trees)), size = 20, replace = FALSE)
trees_train <- trees[train_idx, ]
nrow(trees_train)
## [1] 20
## [1] 11
Using trees_train
for training:
Using trees_test
for testing (“out-of-sample” error):
## [1] 0.98
## [1] 0.97
## [1] 0.016
## [1] 40
## [1] 17
## [1] 2074
## [1] 31
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
Divide data into folds:
“Rotate” which folds are training data and which one is test data:
https://cran.r-project.org/package=caret https://topepo.github.io/caret/
Setting up cross validation:
## Linear Regression
##
## 31 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 10 times)
## Summary of sample sizes: 24, 23, 23, 23, 23, 23, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.5 0.95 3.7
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
## RMSE Rsquared MAE Resample
## 1 4.7 0.91 4.2 Fold1.Rep01
## 2 4.2 0.91 3.6 Fold2.Rep01
## 3 5.3 0.99 3.3 Fold3.Rep01
## 4 4.7 0.96 4.2 Fold4.Rep01
## 5 3.8 0.93 3.2 Fold1.Rep02
## 6 5.6 0.93 4.6 Fold2.Rep02
## 7 2.7 0.97 2.0 Fold3.Rep02
## 8 5.0 0.95 4.7 Fold4.Rep02
## 9 3.5 0.97 2.7 Fold1.Rep03
## 10 4.7 0.93 4.2 Fold2.Rep03
## 11 4.6 0.96 3.6 Fold3.Rep03
## 12 5.2 0.96 4.2 Fold4.Rep03
## 13 4.4 0.93 3.6 Fold1.Rep04
## 14 5.3 0.95 4.2 Fold2.Rep04
## 15 3.5 0.96 2.7 Fold3.Rep04
## 16 4.1 0.95 3.4 Fold4.Rep04
## 17 3.7 0.98 3.0 Fold1.Rep05
## 18 4.4 0.92 3.7 Fold2.Rep05
## 19 6.2 0.98 4.6 Fold3.Rep05
## 20 4.8 0.92 4.2 Fold4.Rep05
## 21 5.8 0.96 4.4 Fold1.Rep06
## 22 4.8 0.92 4.2 Fold2.Rep06
## 23 3.7 0.93 3.1 Fold3.Rep06
## 24 3.0 0.96 2.5 Fold4.Rep06
## 25 5.8 0.95 4.8 Fold1.Rep07
## 26 3.5 0.94 2.8 Fold2.Rep07
## 27 3.8 0.92 3.0 Fold3.Rep07
## 28 4.3 0.97 3.9 Fold4.Rep07
## 29 5.6 0.98 3.8 Fold1.Rep08
## 30 4.5 0.93 4.4 Fold2.Rep08
## 31 5.0 0.94 4.2 Fold3.Rep08
## 32 3.8 0.97 2.8 Fold4.Rep08
## 33 3.8 0.97 3.3 Fold1.Rep09
## 34 4.7 0.94 4.2 Fold2.Rep09
## 35 4.2 0.96 3.7 Fold3.Rep09
## 36 5.3 0.95 3.8 Fold4.Rep09
## 37 3.5 0.97 3.1 Fold1.Rep10
## 38 4.7 0.90 4.1 Fold2.Rep10
## 39 4.2 0.90 3.3 Fold3.Rep10
## 40 4.9 0.96 3.5 Fold4.Rep10
## [1] 4.5
## [1] 4.5
Extra (with tidyverse
, remember library(tidyverse)
):
d <- bind_rows(
m0_cv$resample |> mutate(Model = "Volume ~ Girth"),
m1_cv$resample |> mutate(Model = "Volume ~ poly(Girth, 2)")
)
d_mean <- d |>
group_by(Model) |>
summarise(RMSE = mean(RMSE), .groups = "drop")
ggplot(d, aes(Model, RMSE)) +
geom_boxplot() +
geom_jitter(height = 0, width = 0.1) +
geom_point(data = d_mean, color = "red", shape = 4, size = 5)
## (Intercept) Girth
## -36.9 5.1
Preparing bootstrap:
model_coef <- function(index){
coef(lm(Volume ~ Girth, data = trees, subset = index))
}
model_coef(1:nrow(trees))
## (Intercept) Girth
## -36.9 5.1
## (Intercept) Girth
## -30.6 4.7
## (Intercept) Girth
## -29.8 4.5
## (Intercept) Girth
## -34.4 4.8
set.seed(1)
bootstrap_coefs <- replicate(1000, {
model_coef(sample(1:nrow(trees), replace = TRUE))
})
bootstrap_coefs[, 1:10]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## (Intercept) -29.8 -34.4 -34.3 -42.6 -36 -35.0 -36.5 -39.4 -34.0 -32.1
## Girth 4.5 4.8 4.8 5.5 5 4.9 5.1 5.2 4.8 4.7
## [1] 2 1000
## (Intercept) Girth
## 3.98 0.32
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.9 3.37 -11 7.6e-12
## Girth 5.1 0.25 20 8.6e-19
gf_point(Volume ~ Girth, data = trees) %>%
gf_abline(intercept = coef(m0)[1], slope = coef(m0)[2], color = "red") %>%
gf_abline(intercept = bootstrap_coefs[1, 1], slope = bootstrap_coefs[2, 1], color = "black") %>%
gf_abline(intercept = bootstrap_coefs[1, 2], slope = bootstrap_coefs[2, 2], color = "black")
Also see the boot
package: https://cran.r-project.org/package=boot
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = trees, statistic = model_coef_boot, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -36.9 0.372 4.05
## t2* 5.1 -0.038 0.33
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.9 3.37 -11 7.6e-12
## Girth 5.1 0.25 20 8.6e-19
Resampling residuals with replacement:
set.seed(1)
parbootstrap_coefs <- replicate(1000, {
new_y <- m0$fitted.values + sample(m0$residuals, replace = TRUE)
coef(lm(new_y ~ trees$Girth))
})
parbootstrap_coefs[, 1:10]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## (Intercept) -38.4 -38.5 -34.0 -36 -35 -31.7 -38.6 -40.8 -30.5 -38.8
## trees$Girth 5.2 5.1 4.8 5 5 4.6 5.2 5.3 4.6 5.2
## (Intercept) trees$Girth
## 3.31 0.24
## (Intercept) Girth
## 3.98 0.32
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.9 3.37 -11 7.6e-12
## Girth 5.1 0.25 20 8.6e-19