---
title: "Resampling techniques"
author: "The ASTA team"
output:
  slidy_presentation:
    fig_caption: no
    highlight: tango
    theme: cerulean
  pdf_document:
    fig_caption: no
    keep_tex: yes
    highlight: tango
    number_sections: yes
    toc: yes

---

```{r, include = FALSE}
options(digits = 2)  # print numbers with 2 significant digits throughout
## Remember to add all packages used in the code below!
missing_pkgs <- setdiff(
  c("mosaic", "stringr", "tibble", "tidyr", "dplyr", "equatiomatic",
    "caret", "boot", "splines"),
  rownames(installed.packages())
)
# Install anything that is not already available
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)
library(tibble)
library(tidyr)
library(dplyr)
```


# Resampling techniques


* Generalisation - how well a model performs on a new sample
  + Avoid overfitting
* Cross-validation (estimate out-of-sample prediction error)
* Bootstrap (estimate standard errors)

# Signal or noise?

```{r, message=FALSE}
# mosaic supplies plotModel(), gf_point(), gf_abline() used throughout
library(mosaic)
# Tab-separated tree data; columns Girth and Volume are used below
# (presumably the classic black cherry `trees` data -- verify against source)
trees <- read.delim("https://asta.math.aau.dk/datasets?file=trees.txt")
head(trees)
```

```{r}
# Simple linear regression: Volume explained by Girth
m0 <- lm(Volume ~ Girth, data = trees)
plotModel(m0)  # scatterplot with the fitted line (mosaic)
summary(m0)$r.squared  # in-sample proportion of variance explained
```

```{r}
# equatiomatic renders a fitted model as a LaTeX equation
library(equatiomatic)
extract_eq(m0)  # symbolic form of the model
extract_eq(m0, use_coefs = TRUE)  # with estimated coefficients plugged in
```

----

```{r}
# Quadratic fit: degree-2 (orthogonal) polynomial in Girth
m1 <- lm(Volume ~ poly(Girth, 2), data = trees)
plotModel(m1)
summary(m1)$r.squared
```

----

```{r}
# Degree-15 polynomial: deliberately overflexible, to illustrate overfitting
m1_bad <- lm(Volume ~ poly(Girth, 15), data = trees)
plotModel(m1_bad)
```

----

```{r}
# Natural cubic spline basis with 15 degrees of freedom -- very flexible
library(splines)
m2 <- lm(Volume ~ ns(Girth, 15), data = trees)
plotModel(m2)
```


----

$R^2$ and correlation:

```{r}
# In-sample R^2 for the three models -- more flexibility can only increase it
summary(m0)$r.squared
summary(m1)$r.squared
summary(m2)$r.squared
```


```{r}
# R^2 equals the squared correlation between fitted and observed values
# or: m0$fitted
cor(predict(m0, newdata = trees), trees$Volume)^2
cor(predict(m1, newdata = trees), trees$Volume)^2
cor(predict(m2, newdata = trees), trees$Volume)^2
```

----

Mean squared error (MSE):

```{r}
# In-sample mean squared error: average squared residual for each model
mean((predict(m0, newdata = trees) - trees$Volume)^2)
mean((predict(m1, newdata = trees) - trees$Volume)^2)
mean((predict(m2, newdata = trees) - trees$Volume)^2)
```

----

## Reproducibility and random number generation

```{r}
# Each call to rnorm() advances R's random number stream -> different output
rnorm(3)
rnorm(3)
rnorm(3)
set.seed(1)  # fixing the seed makes the following draws reproducible
rnorm(3)
set.seed(1)  # the same seed reproduces exactly the same numbers
rnorm(3)
```

----

## Out-of-sample error

```{r}
nrow(trees)  # total number of observations
set.seed(1)  # make the random split reproducible
# Sample 20 row indices without replacement for the training set
train_idx <- sample(x = seq_len(nrow(trees)), size = 20, replace = FALSE)
trees_train <- trees[train_idx, ]
nrow(trees_train)
trees_test <- trees[-train_idx, ]  # all remaining rows form the test set
nrow(trees_test)
```

----

Using `trees_train` for training:

```{r}
# Refit all three models using the training data only
m0_train <- lm(Volume ~ Girth, data = trees_train)
m1_train <- lm(Volume ~ poly(Girth, 2), data = trees_train)
m2_train <- lm(Volume ~ ns(Girth, 15), data = trees_train)
```

----

Using `trees_test` for testing ("out-of-sample" error):

```{r}
# Squared correlation between predictions and observations on the test set
cor(predict(m0_train, newdata = trees_test), trees_test$Volume)^2
cor(predict(m1_train, newdata = trees_test), trees_test$Volume)^2
cor(predict(m2_train, newdata = trees_test), trees_test$Volume)^2
```


```{r}
# Out-of-sample MSE, evaluated on the held-out test data
mean((predict(m0_train, newdata = trees_test) - trees_test$Volume)^2)
mean((predict(m1_train, newdata = trees_test) - trees_test$Volume)^2)
mean((predict(m2_train, newdata = trees_test) - trees_test$Volume)^2)
```

----

```{r}
# Training fit (line) with the held-out test observations overlaid in red
plotModel(m0_train) + 
  geom_point(aes(Girth, Volume), data = trees_test, color = "red")
```

----

```{r}
# Quadratic training fit with test observations overlaid in red
plotModel(m1_train) + 
  geom_point(aes(Girth, Volume), data = trees_test, color = "red")
```

----

```{r}
# Spline training fit with test observations overlaid in red
plotModel(m2_train) + 
  geom_point(aes(Girth, Volume), data = trees_test, color = "red")
```


# Cross-validation

* Repeat out-of-sample error estimation multiple times
* Resampling without replacement (partitioning of data)
* $k$-fold cross-validation for $k=10$:

```{r, echo=FALSE, fig.width=10, fig.height=8}
# Schematic of 10-fold CV: a 10 x 10 grid of tiles, one column per CV
# iteration; the diagonal tile in each column is the held-out test fold.
d <- expand.grid(fold = 1:10, test = 1:10) %>%
  as_tibble() %>%
  mutate(label = if_else(test == fold, "Test data", "Training data"))

ggplot(d, aes(xmin = fold - 0.4, xmax = fold + 0.4,
              ymin = test, ymax = test + 1)) +
  geom_rect(aes(fill = label)) +
  geom_text(aes(x = fold, y = test + 0.5, label = test)) +
  theme_void() +
  labs(fill = NULL)
```

----

```{r}
nrow(trees)
seq_len(nrow(trees))
```

* Number of folds depends on size of dataset
* 4-fold cross-validation.

Divide data into folds:

```{r, echo=FALSE, fig.width=5, fig.height=8}
# Randomly assign the row indices 1..n to 4 folds, stored as a list-column;
# d_trees_raw is reused by the "rotation" figure in the next chunk.
set.seed(1)
fold_of_row <- sample(rep(1:4, length.out = nrow(trees)))
d_trees_raw <- tibble(idx = split(seq_len(nrow(trees)), fold_of_row))

# Draw one box per fold, labelled with the row indices it contains
d_trees_raw %>%
  mutate(
    test = row_number(),
    indices = vapply(idx, paste, character(1), collapse = ",")
  ) %>%
  ggplot(aes(xmin = 1, xmax = 2, ymin = test, ymax = test + 1)) +
  geom_rect(fill = "white", color = "black") +
  geom_text(aes(x = 1.5, y = test + 0.5, label = indices)) +
  theme_void() +
  labs(fill = NULL)
```

"Rotate" which folds are training data and which one is test data:

```{r, echo=FALSE, fig.width=10, fig.height=8}
# Expand to one column per CV iteration ("fold"): within each column the
# fold whose number equals the iteration is the test fold, the rest train.
d_trees <- d_trees_raw %>%
  mutate(test = row_number()) %>%
  mutate(fold = list(1:4)) %>%
  unnest(fold) %>%
  mutate(
    indices = vapply(idx, paste, character(1), collapse = ","),
    label = if_else(test == fold, "Test data", "Training data")
  )

ggplot(d_trees, aes(xmin = fold - 0.4, xmax = fold + 0.4,
                    ymin = test, ymax = test + 1)) +
  geom_rect(aes(fill = label)) +
  geom_text(aes(x = fold, y = test + 0.5, label = indices)) +
  theme_void() +
  labs(fill = NULL)
```

----

## Repeated CV

```{r, echo=FALSE, fig.width=10, fig.height=8}
# Repeated CV: redo the random 4-fold assignment 5 times (one column per
# repeat).  The grouped mutate below draws one sample() per repeat, in
# order, so the set.seed(1) stream is consumed exactly as before.
set.seed(1)
d_trees_raw <- tibble(repeat_no = 1:5) %>%
  group_by(repeat_no) %>%
  mutate(idx = list(split(seq_len(nrow(trees)),
                          sample(rep(1:4, length.out = nrow(trees)))))) %>%
  unnest(idx)

# One labelled box per (repeat, fold) combination
d_trees_raw %>%
  group_by(repeat_no) %>%
  mutate(test = row_number()) %>%
  ungroup() %>%
  mutate(indices = vapply(idx, paste, character(1), collapse = ",")) %>%
  ggplot(aes(xmin = repeat_no - 0.4, xmax = repeat_no + 0.4,
             ymin = test, ymax = test + 1)) +
  geom_rect(fill = "white", color = "black") +
  geom_text(aes(x = repeat_no, y = test + 0.5, label = indices)) +
  theme_void() +
  labs(fill = NULL)
```


---- 

```{r, message=FALSE}
# caret: unified interface for model training and cross-validation
library(caret)
```

<https://cran.r-project.org/package=caret>
<https://topepo.github.io/caret/>

Setting up cross validation:

```{r}
train_control <- trainControl(method = "repeatedcv",
                              # number of folds: 4-fold CV
                              number = 4,
                              # the whole 4-fold CV is repeated ten times
                              repeats = 10)
```

```{r}
set.seed(1)  # fold assignment is random; fix the seed for reproducibility
# method = "lm" fits linear models; trControl supplies the CV scheme above
m0_cv <- train(Volume ~ Girth, data = trees, trControl = train_control, method = "lm")
m1_cv <- train(Volume ~ poly(Girth, 2), data = trees, trControl = train_control, method = "lm")
m2_cv <- train(Volume ~ ns(Girth, 15), data = trees, trControl = train_control, method = "lm")
```

----

```{r}
# Printing the caret object shows the resampling setup and averaged metrics
m0_cv
```

* **RMSE** is root mean squared error

----

```{r}
m0_cv$resample  # performance per resample (4 folds x 10 repeats = 40 rows)
mean(m0_cv$resample$RMSE)  # averaging the per-resample RMSE by hand
m0_cv$results$RMSE  # caret's own aggregated RMSE -- same value
```

----

```{r}
# Cross-validated RMSE for the three models
m0_cv$results$RMSE
m1_cv$results$RMSE
m2_cv$results$RMSE
```

----

Extra (with `tidyverse`, remember `library(tidyverse)`):

```{r}
# Stack the per-resample results of two models into one data frame
d <- bind_rows(
  m0_cv$resample |> mutate(Model = "Volume ~ Girth"),
  m1_cv$resample |> mutate(Model = "Volume ~ poly(Girth, 2)")
)
# Mean RMSE per model (drawn as a red cross in the plot below)
d_mean <- d |> 
  group_by(Model) |> 
  summarise(RMSE = mean(RMSE), .groups = "drop")

# Boxplot of per-resample RMSE, individual values jittered on top
ggplot(d, aes(Model, RMSE)) + 
  geom_boxplot() + 
  geom_jitter(height = 0, width = 0.1) + 
  geom_point(data = d_mean, color = "red", shape = 4, size = 5)
```



# Non-parametric bootstrap

* Resampling data with replacement (some observations appear several times, others not at all)
* Mimic a new sample 

```{r}
# Fit on the full data; these coefficients are the point estimates
m0 <- lm(Volume ~ Girth, data = trees)
coef(m0)
```

Preparing bootstrap:

```{r}
# Refit the regression on the rows given by `index` and return the
# coefficients.  Duplicated indices are allowed -- exactly what a
# bootstrap resample produces.  Uses the global `trees` data.
model_coef <- function(index){
  coef(lm(Volume ~ Girth, data = trees, subset = index))
}
model_coef(1:nrow(trees))  # all rows once: reproduces coef(m0)
model_coef(c(rep(1, 10), 11:nrow(trees)))  # row 1 ten times, rows 2-10 left out
set.seed(1)
model_coef(sample(1:nrow(trees), replace = TRUE))  # one bootstrap resample
model_coef(sample(1:nrow(trees), replace = TRUE))  # another one
```

----

```{r}
set.seed(1)
# 1000 bootstrap resamples; replicate() collects the coefficient vectors
# column-wise into a 2 x 1000 matrix (rows: intercept and slope)
bootstrap_coefs <- replicate(1000, {
  model_coef(sample(1:nrow(trees), replace = TRUE))
})
bootstrap_coefs[, 1:10]  # coefficients from the first 10 resamples
dim(bootstrap_coefs)
```

----

```{r}
# Bootstrap standard errors: SD of each coefficient across resamples
apply(bootstrap_coefs, 1, sd)
coef(summary(m0))  # compare with the model-based standard errors
```

```{r}
# Original fit (red) together with the lines from two bootstrap resamples
gf_point(Volume ~ Girth, data = trees) %>%
  gf_abline(intercept = coef(m0)[1], slope = coef(m0)[2], color = "red") %>%
  gf_abline(intercept = bootstrap_coefs[1, 1], slope = bootstrap_coefs[2, 1], color = "black") %>%
  gf_abline(intercept = bootstrap_coefs[1, 2], slope = bootstrap_coefs[2, 2], color = "black")
```

----

```{r}
# Overlay the regression lines from the first 10 bootstrap resamples
p <- gf_point(Volume ~ Girth, data = trees) 

# (use ncol(bootstrap_coefs) to draw all 1000 lines instead of 10)
#for (i in 1:ncol(bootstrap_coefs)) {
for (i in 1:10) {
  p <- p %>% 
    gf_abline(intercept = bootstrap_coefs[1, i], slope = bootstrap_coefs[2, i], color = "darkgrey") 
}

# Finally add the original full-data fit on top in red
p %>% gf_abline(intercept = coef(m0)[1], slope = coef(m0)[2], color = "red")
```

----

Also see the `boot` package: 
<https://cran.r-project.org/package=boot>

```{r, message=FALSE}
# boot: standard package for bootstrap resampling
library(boot)
```

```{r}
# Statistic function for boot(): refit the regression on the rows selected
# by `index` (one bootstrap resample) and return the coefficient vector.
model_coef_boot <- function(data, index) {
  fit <- lm(Volume ~ Girth, data = data, subset = index)
  coef(fit)
}
```

```{r}
set.seed(1)
# boot() draws R = 1000 resamples and applies model_coef_boot to each
b <- boot(trees, model_coef_boot, R = 1000)
b  # prints original estimates, bias and bootstrap standard errors
coef(summary(m0))  # compare with the model-based standard errors
```

# Parametric bootstrap by resampling residuals

Resampling residuals with replacement:

```{r, message=FALSE}
# Load the boot package (already attached above; harmless to repeat)
library(boot)
```

```{r}
# The simple linear model whose residuals are resampled below
m0 <- lm(Volume ~ Girth, data = trees)
```

```{r}
set.seed(1)
# Residual bootstrap: keep the fitted values fixed and add residuals
# resampled with replacement, creating a new response vector each time
parbootstrap_coefs <- replicate(1000, {
  new_y <- m0$fitted.values + sample(m0$residuals, replace = TRUE)
  coef(lm(new_y ~ trees$Girth))
})
parbootstrap_coefs[, 1:10]
apply(parbootstrap_coefs, 1, sd)  # residual-bootstrap standard errors
apply(bootstrap_coefs, 1, sd)  # nonparametric bootstrap SEs, for comparison
coef(summary(m0))  # model-based standard errors
```
