Resampling techniques

The ASTA team


Topics:

- Model complexity
- Measures of fit
- Out-of-sample error
- Cross-validation
- Non-parametric bootstrap

Model complexity

A linear model for tree data
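The trees data set records the girth (diameter in inches), height (in feet) and timber volume (in cubic feet) of 31 felled black cherry trees. As a first model we describe volume as a linear function of girth,

\[ \text{Volume} = \beta_0 + \beta_1\,\text{Girth} + \varepsilon. \]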

# plotModel() below is assumed to come from the mosaic package
# (which also attaches ggplot2, used for the later plots)
library(mosaic)
trees <- read.delim("https://asta.math.aau.dk/datasets?file=trees.txt")
head(trees)
##   Girth Height Volume
## 1   8.3     70     10
## 2   8.6     65     10
## 3   8.8     63     10
## 4  10.5     72     16
## 5  10.7     81     19
## 6  10.8     83     20
m0 <- lm(Volume ~ Girth, data = trees)
plotModel(m0)

summary(m0)
## 
## Call:
## lm(formula = Volume ~ Girth, data = trees)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.065 -3.107  0.152  3.495  9.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -36.943      3.365   -11.0  7.6e-12 ***
## Girth          5.066      0.247    20.5  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.2 on 29 degrees of freedom
## Multiple R-squared:  0.935,  Adjusted R-squared:  0.933 
## F-statistic:  419 on 1 and 29 DF,  p-value: <2e-16

A polynomial model
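A quadratic model allows curvature:

\[ \text{Volume} = \beta_0 + \beta_1\,\text{Girth} + \beta_2\,\text{Girth}^2 + \varepsilon. \]

Note that poly(Girth, 2) uses orthogonal polynomials, so the reported coefficients are not the raw \(\beta\)'s above (poly(Girth, 2, raw = TRUE) would give those), but the fitted curve is the same.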

m1 <- lm(Volume ~ poly(Girth, 2), data = trees)
plotModel(m1)

summary(m1)
## 
## Call:
## lm(formula = Volume ~ poly(Girth, 2), data = trees)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.489 -2.429 -0.372  2.076  7.645 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       30.171      0.599   50.37  < 2e-16 ***
## poly(Girth, 2)1   87.073      3.335   26.11  < 2e-16 ***
## poly(Girth, 2)2   14.592      3.335    4.38  0.00015 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.3 on 28 degrees of freedom
## Multiple R-squared:  0.962,  Adjusted R-squared:  0.959 
## F-statistic:  350 on 2 and 28 DF,  p-value: <2e-16

Another polynomial model

Increasing the flexibility further, a degree-7 polynomial follows the individual data points more closely and starts fitting noise rather than the underlying trend:

m1_bad <- lm(Volume ~ poly(Girth, 7), data = trees)
plotModel(m1_bad)

A natural spline

A natural cubic spline with 15 degrees of freedom is even more flexible and wiggles through the data:

library(splines)
m2 <- lm(Volume ~ ns(Girth, 15), data = trees)
plotModel(m2)

Measures of fit

\(R^2\) and correlation
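For a least squares fit with an intercept, \(R^2\) equals the squared correlation between the observed response and the fitted values,

\[ R^2 = \operatorname{cor}(y, \hat{y})^2, \]

which is how it is computed below.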

summary(m0)$r.squared
## [1] 0.94
summary(m1)$r.squared
## [1] 0.96
summary(m2)$r.squared
## [1] 0.98
# predict(m0, newdata = trees) equals the fitted values m0$fitted
cor(trees$Volume, predict(m0, newdata = trees))^2
## [1] 0.94
cor(trees$Volume, predict(m1, newdata = trees))^2
## [1] 0.96
cor(trees$Volume, predict(m2, newdata = trees))^2
## [1] 0.98

Mean squared error
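Another in-sample measure is the mean squared error of the predictions,

\[ \text{MSE} = \frac{1}{n}\sum_{i=1}^{n} \left(y_i - \hat{y}_i\right)^2. \]

The more flexible the model, the smaller the in-sample MSE: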

mean((trees$Volume - predict(m0, newdata = trees))^2)
## [1] 17
mean((trees$Volume - predict(m1, newdata = trees))^2)
## [1] 10
mean((trees$Volume - predict(m2, newdata = trees))^2)
## [1] 4.9

Out-of-sample error

In-sample measures always improve as the model becomes more flexible, so they cannot tell us when we start overfitting. Instead we evaluate the model on data that were not used for fitting. This requires random data splits, so we first make the random numbers reproducible.

Reproducibility and random number generation

R's random numbers are generated from an internal state; set.seed() fixes that state, so the same sequence of "random" numbers is produced every time:

rnorm(3)
## [1]  1.05  1.74 -0.68
rnorm(3)
## [1]  1.96 -0.59  0.72
rnorm(3)
## [1]  0.337  0.319 -0.077
set.seed(1)
rnorm(3)
## [1] -0.63  0.18 -0.84
set.seed(1)
rnorm(3)
## [1] -0.63  0.18 -0.84

Out-of-sample error

We randomly split the 31 trees into a training set of 20 trees, used for fitting, and a test set of the remaining 11 trees, used only for evaluation:

set.seed(1)
train_idx <- sample(x = seq_len(nrow(trees)), size = 20, replace = FALSE)
trees_train <- trees[train_idx, ]
nrow(trees_train)
## [1] 20
trees_test <- trees[-train_idx, ]
nrow(trees_test)
## [1] 11

Training the models

All three models are now fitted to the training data only. In the plots the red points are the unseen test data:

m0_train <- lm(Volume ~ Girth, data = trees_train)
m1_train <- lm(Volume ~ poly(Girth, 2), data = trees_train)
m2_train <- lm(Volume ~ ns(Girth, 15), data = trees_train)
plotModel(m0_train) + 
  geom_point(aes(Girth, Volume), data = trees_test, color = "red")

plotModel(m1_train) + 
  geom_point(aes(Girth, Volume), data = trees_test, color = "red")

plotModel(m2_train) + 
  geom_point(aes(Girth, Volume), data = trees_test, color = "red")

Testing the models

On the test data the ranking reverses: the spline, which fitted the training data best, predicts new observations far worse than the two simple models:

cor(predict(m0_train, newdata = trees_test), trees_test$Volume)^2
## [1] 0.98
cor(predict(m1_train, newdata = trees_test), trees_test$Volume)^2
## [1] 0.97
cor(predict(m2_train, newdata = trees_test), trees_test$Volume)^2
## [1] 0.016
mean((predict(m0_train, newdata = trees_test) - trees_test$Volume)^2)
## [1] 40
mean((predict(m1_train, newdata = trees_test) - trees_test$Volume)^2)
## [1] 17
mean((predict(m2_train, newdata = trees_test) - trees_test$Volume)^2)
## [1] 2074

Summary

More flexible models always fit the training data better (higher \(R^2\), lower MSE), but beyond a certain point the extra flexibility fits noise, and predictions for new data deteriorate. Predictive ability should therefore be judged by out-of-sample error, not by in-sample fit.

Cross-validation

Testing predictive ability

A single train/test split uses the data inefficiently: part of the data is never used for fitting, and the error estimate depends on which observations happen to end up in the test set.

Cross-validation

In k-fold cross-validation the data are instead split at random into k folds of roughly equal size. Each fold in turn is held out as test set, the model is fitted to the remaining k - 1 folds, and the prediction error is computed on the held-out fold. The k error estimates are then averaged, so every observation is used both for fitting and for testing.

Example

With k = 4 and our n = 31 trees, each fold contains 7 or 8 trees, so every fit uses 23 or 24 observations and is tested on the rest.

Repeated CV

The fold assignment is random, so the cross-validation error is itself random. In repeated cross-validation the whole k-fold procedure is repeated several times with new random folds, and the error is averaged over all folds and repeats.
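As a sketch of what one round of 4-fold cross-validation does (the caret package below automates all of this; fold and cv_rmse are illustrative names):

set.seed(1)
k <- 4
# randomly assign each tree to one of k folds
fold <- sample(rep(1:k, length.out = nrow(trees)))
cv_rmse <- sapply(1:k, function(j) {
  fit <- lm(Volume ~ Girth, data = trees[fold != j, ])  # train on k - 1 folds
  pred <- predict(fit, newdata = trees[fold == j, ])    # predict held-out fold
  sqrt(mean((trees$Volume[fold == j] - pred)^2))        # RMSE on held-out fold
})
mean(cv_rmse)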

Cross-validation in R

library(caret)

https://cran.r-project.org/package=caret

https://topepo.github.io/caret/

train_control <- trainControl(method = "repeatedcv",
                              # k-fold CV with k = 4
                              number = 4,
                              # repeated five times
                              repeats = 5)
set.seed(1)
m0_cv <- train(Volume ~ Girth, data = trees, trControl = train_control, method = "lm")
m1_cv <- train(Volume ~ poly(Girth, 2), data = trees, trControl = train_control, method = "lm")
m2_cv <- train(Volume ~ ns(Girth, 15), data = trees, trControl = train_control, method = "lm")
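The fitted train objects can also be compared directly with caret's resamples() helper; a sketch (for identical folds across models, set.seed() would have to be called before each train()):

res <- resamples(list(linear = m0_cv, quadratic = m1_cv, spline = m2_cv))
summary(res)$statistics$RMSE   # fold-wise RMSE summaries, one row per model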

Result of cross-validation

m0_cv
## Linear Regression 
## 
## 31 samples
##  1 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 5 times) 
## Summary of sample sizes: 24, 23, 23, 23, 23, 23, ... 
## Resampling results:
## 
##   RMSE  Rsquared  MAE
##   4.5   0.95      3.7
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

More on RMSE
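The root mean squared error is the square root of the MSE, and hence on the same scale as the response:

\[ \text{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n} \left(y_i - \hat{y}_i\right)^2}. \]

Each row of the resample element below is the result on one held-out fold (4 folds times 5 repeats gives 20 rows); the RMSE reported above is their average: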

m0_cv$resample
##    RMSE Rsquared MAE   Resample
## 1   4.7     0.91 4.2 Fold1.Rep1
## 2   4.2     0.91 3.6 Fold2.Rep1
## 3   5.3     0.99 3.3 Fold3.Rep1
## 4   4.7     0.96 4.2 Fold4.Rep1
## 5   3.8     0.93 3.2 Fold1.Rep2
## 6   5.6     0.93 4.6 Fold2.Rep2
## 7   2.7     0.97 2.0 Fold3.Rep2
## 8   5.0     0.95 4.7 Fold4.Rep2
## 9   3.5     0.97 2.7 Fold1.Rep3
## 10  4.7     0.93 4.2 Fold2.Rep3
## 11  4.6     0.96 3.6 Fold3.Rep3
## 12  5.2     0.96 4.2 Fold4.Rep3
## 13  4.4     0.93 3.6 Fold1.Rep4
## 14  5.3     0.95 4.2 Fold2.Rep4
## 15  3.5     0.96 2.7 Fold3.Rep4
## 16  4.1     0.95 3.4 Fold4.Rep4
## 17  3.7     0.98 3.0 Fold1.Rep5
## 18  4.4     0.92 3.7 Fold2.Rep5
## 19  6.2     0.98 4.6 Fold3.Rep5
## 20  4.8     0.92 4.2 Fold4.Rep5
mean(m0_cv$resample$RMSE)
## [1] 4.5
m0_cv$results$RMSE
## [1] 4.5

Model comparison

Comparing cross-validated RMSE, the quadratic model predicts best, while the overly flexible spline is far worse than even the straight line:

m0_cv$results$RMSE
## [1] 4.5
m1_cv$results$RMSE
## [1] 3.5
m2_cv$results$RMSE
## [1] 86

Non-parametric bootstrap

Sampling variability

An estimate such as a regression coefficient varies from sample to sample, and its standard error describes this sampling variability. In practice we only have one sample, so we cannot observe the variability directly.

Bootstrap principle

The bootstrap mimics drawing new samples from the population: we draw n observations with replacement from the observed data, recompute the estimate on each such bootstrap sample, and use the spread of the bootstrap estimates to approximate the sampling variability. Sampling with replacement means some observations appear several times and others not at all:

index <- c(1, 2, 3, 4, 5)
set.seed(1)
boot_index <- sample(index, replace = TRUE)
boot_index
## [1] 1 4 1 2 5
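As a minimal sketch of the principle for a simpler statistic, the mean of Volume (boot_means is an illustrative name):

set.seed(1)
# recompute the mean on 1000 resamples drawn with replacement
boot_means <- replicate(1000, mean(sample(trees$Volume, replace = TRUE)))
sd(boot_means)   # bootstrap estimate of the standard error of the mean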

Bootstrap data example

We now bootstrap the regression coefficients by resampling rows of the trees data; model_coef() refits the model to a given set of row indices:

m0 <- lm(Volume ~ Girth, data = trees)
coef(m0)
## (Intercept)       Girth 
##       -36.9         5.1
model_coef <- function(index){
  coef(lm(Volume ~ Girth, data = trees, subset = index))
}
model_coef(1:nrow(trees))
## (Intercept)       Girth 
##       -36.9         5.1
set.seed(1)
model_coef(sample(1:nrow(trees), replace = TRUE))
## (Intercept)       Girth 
##       -29.8         4.5
model_coef(sample(1:nrow(trees), replace = TRUE))
## (Intercept)       Girth 
##       -34.4         4.8

Bootstrap data example - continued

Repeating this 1000 times gives the bootstrap distribution of the coefficients:

set.seed(1)
bootstrap_coefs <- replicate(1000, {
  model_coef(sample(1:nrow(trees), replace = TRUE))
})
bootstrap_coefs[, 1:10]
##              [,1]  [,2]  [,3]  [,4] [,5]  [,6]  [,7]  [,8]  [,9] [,10]
## (Intercept) -29.8 -34.4 -34.3 -42.6  -36 -35.0 -36.5 -39.4 -34.0 -32.1
## Girth         4.5   4.8   4.8   5.5    5   4.9   5.1   5.2   4.8   4.7

Bootstrap estimates for the standard error

The bootstrap standard error of each coefficient is the standard deviation of that coefficient across the 1000 bootstrap samples. It agrees reasonably well with the model-based standard errors from summary():

apply(bootstrap_coefs, 1, sd) # sd of each row of bootstrap_coefs
## (Intercept)       Girth 
##        3.98        0.32
summary(m0)
## 
## Call:
## lm(formula = Volume ~ Girth, data = trees)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.065 -3.107  0.152  3.495  9.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -36.943      3.365   -11.0  7.6e-12 ***
## Girth          5.066      0.247    20.5  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.2 on 29 degrees of freedom
## Multiple R-squared:  0.935,  Adjusted R-squared:  0.933 
## F-statistic:  419 on 1 and 29 DF,  p-value: <2e-16

The boot package

The boot package automates this: boot() takes the data and a function of (data, index) that computes the statistic, and reports the bootstrap estimates of bias and standard error:

library(boot)
model_coef_boot <- function(data, index){
  coef(lm(Volume ~ Girth, data = data, subset = index))
}
set.seed(1)
b <- boot(trees, model_coef_boot, R = 1000)
b
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = trees, statistic = model_coef_boot, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original  bias    std. error
## t1*    -36.9   0.372        4.05
## t2*      5.1  -0.038        0.33
coef(summary(m0))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -36.9       3.37     -11  7.6e-12
## Girth            5.1       0.25      20  8.6e-19
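The object returned by boot() can also be passed to boot.ci() to get bootstrap confidence intervals, e.g. a percentile interval for the slope (a sketch):

boot.ci(b, type = "perc", index = 2)   # index = 2 selects the Girth coefficient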

Bootstrap by resampling residuals

An alternative keeps the design fixed and resamples the residuals instead of whole observations: each bootstrap response is a fitted value plus a randomly drawn residual, \( y_i^* = \hat{y}_i + e_i^* \). This assumes the mean structure of the model is correct and the errors are exchangeable:

m0 <- lm(Volume ~ Girth, data = trees)
set.seed(1)
res_bootstrap_coefs <- replicate(1000, {
  new_y <- m0$fitted.values + sample(m0$residuals, replace = TRUE)
  coef(lm(new_y ~ trees$Girth))
})
res_bootstrap_coefs[, 1:10]
##              [,1]  [,2]  [,3] [,4] [,5]  [,6]  [,7]  [,8]  [,9] [,10]
## (Intercept) -38.4 -38.5 -34.0  -36  -35 -31.7 -38.6 -40.8 -30.5 -38.8
## trees$Girth   5.2   5.1   4.8    5    5   4.6   5.2   5.3   4.6   5.2
apply(res_bootstrap_coefs, 1, sd)
## (Intercept) trees$Girth 
##        3.31        0.24
apply(bootstrap_coefs, 1, sd)
## (Intercept)       Girth 
##        3.98        0.32
coef(summary(m0))
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -36.9       3.37     -11  7.6e-12
## Girth            5.1       0.25      20  8.6e-19