Packages

library(MASS)
library(tidyverse)
library(broom)
library(modelr)
theme_set(theme_bw(base_size = 16))

Nesting data

Gapminder data

library(gapminder)
gapminder %>% 
  filter(country == "Denmark")
# A tibble: 12 x 6
   country continent  year lifeExp     pop gdpPercap
   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
 1 Denmark Europe     1952    70.8 4334000     9692.
 2 Denmark Europe     1957    71.8 4487831    11100.
 3 Denmark Europe     1962    72.4 4646899    13583.
 4 Denmark Europe     1967    73.0 4838800    15937.
 5 Denmark Europe     1972    73.5 4991596    18866.
 6 Denmark Europe     1977    74.7 5088419    20423.
 7 Denmark Europe     1982    74.6 5117810    21688.
 8 Denmark Europe     1987    74.8 5127024    25116.
 9 Denmark Europe     1992    75.3 5171393    26407.
10 Denmark Europe     1997    76.1 5283663    29804.
11 Denmark Europe     2002    77.2 5374693    32167.
12 Denmark Europe     2007    78.3 5468120    35278.
gapminder %>% 
  ggplot(aes(x = year, y = lifeExp, colour = country)) + 
  geom_line() + facet_wrap(~continent) + 
  scale_color_manual(values = country_colors) +
  guides(colour = FALSE)

Nested columns

gapminder_nest <- gapminder %>% 
  group_by(continent, country) %>% 
  nest(.key = "country_data")

gapminder_nest
# A tibble: 142 x 3
   continent country     country_data     
   <fct>     <fct>       <list>           
 1 Asia      Afghanistan <tibble [12 × 4]>
 2 Europe    Albania     <tibble [12 × 4]>
 3 Africa    Algeria     <tibble [12 × 4]>
 4 Africa    Angola      <tibble [12 × 4]>
 5 Americas  Argentina   <tibble [12 × 4]>
 6 Oceania   Australia   <tibble [12 × 4]>
 7 Europe    Austria     <tibble [12 × 4]>
 8 Asia      Bahrain     <tibble [12 × 4]>
 9 Asia      Bangladesh  <tibble [12 × 4]>
10 Europe    Belgium     <tibble [12 × 4]>
# … with 132 more rows

Just looking at the Danish data

gapminder_nest %>% filter(country == "Denmark")
# A tibble: 1 x 3
  continent country country_data     
  <fct>     <fct>   <list>           
1 Europe    Denmark <tibble [12 × 4]>

Several ways to look at specific data

gapminder_nest %>% 
  filter(country == "Denmark") %>% 
  pull(country_data)
[[1]]
# A tibble: 12 x 4
    year lifeExp     pop gdpPercap
   <int>   <dbl>   <int>     <dbl>
 1  1952    70.8 4334000     9692.
 2  1957    71.8 4487831    11100.
 3  1962    72.4 4646899    13583.
 4  1967    73.0 4838800    15937.
 5  1972    73.5 4991596    18866.
 6  1977    74.7 5088419    20423.
 7  1982    74.6 5117810    21688.
 8  1987    74.8 5127024    25116.
 9  1992    75.3 5171393    26407.
10  1997    76.1 5283663    29804.
11  2002    77.2 5374693    32167.
12  2007    78.3 5468120    35278.
gapminder_nest %>% 
  filter(country == "Denmark") %>% 
  .$country_data
[[1]]
# A tibble: 12 x 4
    year lifeExp     pop gdpPercap
   <int>   <dbl>   <int>     <dbl>
 1  1952    70.8 4334000     9692.
 2  1957    71.8 4487831    11100.
 3  1962    72.4 4646899    13583.
 4  1967    73.0 4838800    15937.
 5  1972    73.5 4991596    18866.
 6  1977    74.7 5088419    20423.
 7  1982    74.6 5117810    21688.
 8  1987    74.8 5127024    25116.
 9  1992    75.3 5171393    26407.
10  1997    76.1 5283663    29804.
11  2002    77.2 5374693    32167.
12  2007    78.3 5468120    35278.
gapminder_nest %>% 
  filter(country == "Denmark") %>% 
  unnest() ## adds the grouping columns back on
# A tibble: 12 x 6
   continent country  year lifeExp     pop gdpPercap
   <fct>     <fct>   <int>   <dbl>   <int>     <dbl>
 1 Europe    Denmark  1952    70.8 4334000     9692.
 2 Europe    Denmark  1957    71.8 4487831    11100.
 3 Europe    Denmark  1962    72.4 4646899    13583.
 4 Europe    Denmark  1967    73.0 4838800    15937.
 5 Europe    Denmark  1972    73.5 4991596    18866.
 6 Europe    Denmark  1977    74.7 5088419    20423.
 7 Europe    Denmark  1982    74.6 5117810    21688.
 8 Europe    Denmark  1987    74.8 5127024    25116.
 9 Europe    Denmark  1992    75.3 5171393    26407.
10 Europe    Denmark  1997    76.1 5283663    29804.
11 Europe    Denmark  2002    77.2 5374693    32167.
12 Europe    Denmark  2007    78.3 5468120    35278.

Wrapping model fits over groups

R has a long tradition of using lists (and nested lists). Hence, there is a large group of functions that are designed for computing efficiently on these objects.

mtcars_cyl <- mtcars %>% 
  rownames_to_column(var = "car") %>% 
  as_tibble() %>% 
  split(.$cyl)

Now for each cylinder grouping we could be interested in fitting the linear relationship between mpg and hp

mtcars_cyl4_fit <- lm(mpg ~ hp, data = mtcars_cyl$`4`)
mtcars_cyl6_fit <- lm(mpg ~ hp, data = mtcars_cyl$`6`)
mtcars_cyl8_fit <- lm(mpg ~ hp, data = mtcars_cyl$`8`)

However, for several subgroups this approach becomes tedious. The family of apply functions are the traditional way for doing this. However, nowadays the purrr package provides similar functionality (in the tidyverse framework)

Using lapply

First we show how to do this using the lapply function (for reference only). Here, for ease of repeating the same fit we define the function lm_mpg_hp. Note, this could also had made sense above when doing the fits manually.

lm_mpg_hp <- function(df){
  lm(mpg ~ hp, data = df)
}
mtcars_cyl_fit <- lapply(mtcars_cyl, lm_mpg_hp)
mtcars_cyl_fit %>% lapply(coef)
$`4`
(Intercept)          hp 
 35.9830256  -0.1127759 

$`6`
 (Intercept)           hp 
20.673851133 -0.007613269 

$`8`
(Intercept)          hp 
18.08007371 -0.01424412 
mtcars_cyl_fit %>% lapply(summary)
$`4`

Call:
lm(formula = mpg ~ hp, data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-4.591 -2.582 -1.240  2.071  7.161 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.98303    5.20130   6.918 6.93e-05 ***
hp          -0.11278    0.06118  -1.843   0.0984 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.05 on 9 degrees of freedom
Multiple R-squared:  0.2741,    Adjusted R-squared:  0.1934 
F-statistic: 3.398 on 1 and 9 DF,  p-value: 0.0984


$`6`

Call:
lm(formula = mpg ~ hp, data = df)

Residuals:
      1       2       3       4       5       6       7 
 1.1636  1.1636  1.5636 -1.7745 -0.5374 -1.9374  0.3585 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 20.673851   3.304429   6.256  0.00153 **
hp          -0.007613   0.026578  -0.286  0.78602   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.579 on 5 degrees of freedom
Multiple R-squared:  0.01615,   Adjusted R-squared:  -0.1806 
F-statistic: 0.08206 on 1 and 5 DF,  p-value: 0.786


$`8`

Call:
lm(formula = mpg ~ hp, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.7600 -0.6685 -0.1971  1.6389  3.6126 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 18.08007    2.98756   6.052 5.74e-05 ***
hp          -0.01424    0.01390  -1.025    0.326    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.555 on 12 degrees of freedom
Multiple R-squared:  0.08045,   Adjusted R-squared:  0.00382 
F-statistic:  1.05 on 1 and 12 DF,  p-value: 0.3258

Using purrr

Using purrr this is done by the map function. Warning! Because of its name the purrr::map function is occasionally masked by the maps::map function. This results in unintended behavior. Hence, I often put the namespace purrr:: in front of map to make sure this does not happen in my R sessions.

mtcars_cyl %>% purrr::map(.f = ~.x %>% lm_mpg_hp())
$`4`

Call:
lm(formula = mpg ~ hp, data = df)

Coefficients:
(Intercept)           hp  
    35.9830      -0.1128  


$`6`

Call:
lm(formula = mpg ~ hp, data = df)

Coefficients:
(Intercept)           hp  
  20.673851    -0.007613  


$`8`

Call:
lm(formula = mpg ~ hp, data = df)

Coefficients:
(Intercept)           hp  
   18.08007     -0.01424  

Note here the ~.x syntax. This ensures that the map function can access the data. The .x is the data argument to purrr::map and the ~ is needed here due to technical requirements (NSE: non-standard evaluation).

Back to gapminder data

When an analysis gets complex and keeping track of fitted models and their association to data, it may be ideal to keep the models and data next to each other. Fortunately the list columns of tibbles allows for this. Above we saw that list columns can hold entire tibbles and cell entries. Other types of objects, e.g. list, vector and model objects (like lm objects can be stored).

Instead of defining a function in the global environment, we can also just specify the operations on the fly. Here the syntactical ~ is very important.

gapminder_nest_lm <- gapminder_nest %>% 
  mutate(lm_lifeExp_year = purrr::map(.x = country_data, .f = ~lm(lifeExp ~ year, data = .x)))
gapminder_nest_lm
# A tibble: 142 x 4
   continent country     country_data      lm_lifeExp_year
   <fct>     <fct>       <list>            <list>         
 1 Asia      Afghanistan <tibble [12 × 4]> <S3: lm>       
 2 Europe    Albania     <tibble [12 × 4]> <S3: lm>       
 3 Africa    Algeria     <tibble [12 × 4]> <S3: lm>       
 4 Africa    Angola      <tibble [12 × 4]> <S3: lm>       
 5 Americas  Argentina   <tibble [12 × 4]> <S3: lm>       
 6 Oceania   Australia   <tibble [12 × 4]> <S3: lm>       
 7 Europe    Austria     <tibble [12 × 4]> <S3: lm>       
 8 Asia      Bahrain     <tibble [12 × 4]> <S3: lm>       
 9 Asia      Bangladesh  <tibble [12 × 4]> <S3: lm>       
10 Europe    Belgium     <tibble [12 × 4]> <S3: lm>       
# … with 132 more rows
gapminder_nest_lm_Denmark <- gapminder_nest_lm %>% 
  filter(country == "Denmark")

summary(gapminder_nest_lm_Denmark$lm_lifeExp_year[[1]])

Call:
lm(formula = lifeExp ~ year, data = .x)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.55679 -0.28605  0.04486  0.12228  0.62526 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -165.80271   13.20311  -12.56 1.90e-07 ***
year           0.12133    0.00667   18.19 5.41e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3988 on 10 degrees of freedom
Multiple R-squared:  0.9707,    Adjusted R-squared:  0.9677 
F-statistic: 330.9 on 1 and 10 DF,  p-value: 5.41e-09

Tidy modeling

The modelr and broom packages

modelr

The modelr package provides several useful helper functions:

  • add_residuals()
  • add_predictions()
  • bootstrap()
  • crossv_kfold()

We can add residuals and predictions to the data using the first two functions. The add_residuals takes two arguments - the data and the model. Hence, we need to run over two lists simultaneously. For this we use the map2 function that operates on .x and .y and applies the function .f

gapminder_nest_lm <- gapminder_nest_lm %>% 
  mutate(
    country_data = map2(country_data, lm_lifeExp_year, add_residuals),
    country_data = map2(country_data, lm_lifeExp_year, add_predictions)
  )

We may unnest these to obtain the data for e.g. plotting the residuals. However, first the irregular column of regressions must be dropped.

gapminder_lm <- gapminder_nest_lm %>% 
  select(-lm_lifeExp_year) %>% 
  unnest()
gapminder_lm %>% filter(country == "Denmark") %>% 
  ggplot(aes(x = seq_along(resid), y = resid)) +
  geom_point() + 
  geom_smooth()

broom

The broom package provides another set of nice features:

  • glance()
  • tidy()
  • augment()
gapminder_nest_lm <- gapminder_nest_lm %>% 
  mutate(
    country_data_aug = map(lm_lifeExp_year, augment)
  )

We see that it gives the same columns added to the data as add_predictions() and add_residuals() but with a simpler syntax.

gapminder_nest_lm %>% filter(country == "Denmark") %>% 
  select(-lm_lifeExp_year) %>% 
  unnest() %>% 
  select(-ends_with("1"))
# A tibble: 12 x 15
   continent country  year lifeExp    pop gdpPercap    resid  pred .fitted
   <fct>     <fct>   <int>   <dbl>  <int>     <dbl>    <dbl> <dbl>   <dbl>
 1 Europe    Denmark  1952    70.8 4.33e6     9692. -0.254    71.0    71.0
 2 Europe    Denmark  1957    71.8 4.49e6    11100.  0.170    71.6    71.6
 3 Europe    Denmark  1962    72.4 4.65e6    13583.  0.103    72.2    72.2
 4 Europe    Denmark  1967    73.0 4.84e6    15937.  0.106    72.9    72.9
 5 Europe    Denmark  1972    73.5 4.99e6    18866.  0.00981  73.5    73.5
 6 Europe    Denmark  1977    74.7 5.09e6    20423.  0.623    74.1    74.1
 7 Europe    Denmark  1982    74.6 5.12e6    21688. -0.0435   74.7    74.7
 8 Europe    Denmark  1987    74.8 5.13e6    25116. -0.480    75.3    75.3
 9 Europe    Denmark  1992    75.3 5.17e6    26407. -0.557    75.9    75.9
10 Europe    Denmark  1997    76.1 5.28e6    29804. -0.383    76.5    76.5
11 Europe    Denmark  2002    77.2 5.37e6    32167.  0.0799   77.1    77.1
12 Europe    Denmark  2007    78.3 5.47e6    35278.  0.625    77.7    77.7
# … with 6 more variables: .se.fit <dbl>, .resid <dbl>, .hat <dbl>,
#   .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>

By the glance and tidy functions it is straight forward to obtain summary output and parameter estimates in a tidy format.

First some standard model summary information from based on the glance function.

gapminder_nest_lm %>% 
  mutate(model_summary = map(lm_lifeExp_year, glance)) %>% 
  select(continent:country, model_summary) %>% 
  unnest()
# A tibble: 142 x 13
   continent country r.squared adj.r.squared sigma statistic  p.value    df
   <fct>     <fct>       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>
 1 Asia      Afghan…     0.948         0.942 1.22      181.  9.84e- 8     2
 2 Europe    Albania     0.911         0.902 1.98      102.  1.46e- 6     2
 3 Africa    Algeria     0.985         0.984 1.32      662.  1.81e-10     2
 4 Africa    Angola      0.888         0.877 1.41       79.1 4.59e- 6     2
 5 Americas  Argent…     0.996         0.995 0.292    2246.  4.22e-13     2
 6 Oceania   Austra…     0.980         0.978 0.621     481.  8.67e-10     2
 7 Europe    Austria     0.992         0.991 0.407    1261.  7.44e-12     2
 8 Asia      Bahrain     0.967         0.963 1.64      291.  1.02e- 8     2
 9 Asia      Bangla…     0.989         0.988 0.977     930.  3.37e-11     2
10 Europe    Belgium     0.995         0.994 0.293    1822.  1.20e-12     2
# … with 132 more rows, and 5 more variables: logLik <dbl>, AIC <dbl>,
#   BIC <dbl>, deviance <dbl>, df.residual <int>

Under the hood the glance function calls the appropriate glace.MODEL_TYPE function. See the number of supported models by typing broom:::glance. in the console and [TAB] for auto-completion.

The tidy function behaves similar to summary()$coef function except it names the columns to be more assessible for later use, e.g. p.value instead of Pr(>|t|):

gapminder_nest_lm %>% 
  mutate(coefs = map(lm_lifeExp_year, tidy, quick = TRUE)) %>% 
  select(continent:country, coefs) %>% 
  unnest() %>% 
  spread(term, estimate)
# A tibble: 142 x 4
   continent country                  `(Intercept)`   year
   <fct>     <fct>                            <dbl>  <dbl>
 1 Africa    Algeria                        -1068.  0.569 
 2 Africa    Angola                          -377.  0.209 
 3 Africa    Benin                           -613.  0.334 
 4 Africa    Botswana                         -65.5 0.0607
 5 Africa    Burkina Faso                    -676.  0.364 
 6 Africa    Burundi                         -260.  0.154 
 7 Africa    Cameroon                        -447.  0.250 
 8 Africa    Central African Republic        -320.  0.184 
 9 Africa    Chad                            -455.  0.253 
10 Africa    Comoros                         -839.  0.450 
# … with 132 more rows

The use of quick = TRUE suppresses all the inference interesting output (e.g. p.value and parameter uncertainty).