Analysis of Covariance

The ASTA team

The regression problem

Example

Income <- read.delim("https://asta.math.aau.dk/datasets?file=Income.txt")
library(mosaic)
gf_point(inc ~ educ, color = ~race, data = Income) %>% gf_lm()
## Warning: Using the `size` aesthetic with geom_line was deprecated in ggplot2 3.4.0.
## â„ą Please use the `linewidth` aesthetic instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Dummy coding

Dummy coding

Example

Income$race <- factor(Income$race)
Income$race <- relevel(Income$race, "w")
model1 <- lm(inc ~ educ + race, data = Income)
summary(model1)
## 
## Call:
## lm(formula = inc ~ educ + race, data = Income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.664  -9.622  -1.642   6.552  57.620 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.6635     8.4121  -1.862   0.0665 .  
## educ          4.4317     0.6191   7.158 4.42e-10 ***
## raceb       -10.8744     4.4730  -2.431   0.0174 *  
## raceh        -4.9338     4.7632  -1.036   0.3036    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 76 degrees of freedom
## Multiple R-squared:  0.462,  Adjusted R-squared:  0.4408 
## F-statistic: 21.75 on 3 and 76 DF,  p-value: 2.853e-10
plotModel(model1)

Example: Prediction equations

summary(model1)
## 
## Call:
## lm(formula = inc ~ educ + race, data = Income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.664  -9.622  -1.642   6.552  57.620 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.6635     8.4121  -1.862   0.0665 .  
## educ          4.4317     0.6191   7.158 4.42e-10 ***
## raceb       -10.8744     4.4730  -2.431   0.0174 *  
## raceh        -4.9338     4.7632  -1.036   0.3036    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 76 degrees of freedom
## Multiple R-squared:  0.462,  Adjusted R-squared:  0.4408 
## F-statistic: 21.75 on 3 and 76 DF,  p-value: 2.853e-10

Agresti – summary

Model with interaction

Interaction

Interaction

Example: Prediction equations

model2 <- lm(inc ~ educ * race, data = Income)
summary(model2)
## 
## Call:
## lm(formula = inc ~ educ * race, data = Income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.064  -9.448  -1.453   6.167  56.936 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -25.8688    10.4982  -2.464   0.0161 *  
## educ          5.2095     0.7828   6.655  4.3e-09 ***
## raceb        19.3333    18.2928   1.057   0.2940    
## raceh         9.2640    24.2797   0.382   0.7039    
## educ:raceb   -2.4107     1.4177  -1.700   0.0933 .  
## educ:raceh   -1.1208     2.0060  -0.559   0.5781    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.37 on 74 degrees of freedom
## Multiple R-squared:  0.4825, Adjusted R-squared:  0.4475 
## F-statistic:  13.8 on 5 and 74 DF,  p-value: 1.618e-09

Example: Individual tests

summary(model2)
## 
## Call:
## lm(formula = inc ~ educ * race, data = Income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.064  -9.448  -1.453   6.167  56.936 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -25.8688    10.4982  -2.464   0.0161 *  
## educ          5.2095     0.7828   6.655  4.3e-09 ***
## raceb        19.3333    18.2928   1.057   0.2940    
## raceh         9.2640    24.2797   0.382   0.7039    
## educ:raceb   -2.4107     1.4177  -1.700   0.0933 .  
## educ:raceh   -1.1208     2.0060  -0.559   0.5781    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.37 on 74 degrees of freedom
## Multiple R-squared:  0.4825, Adjusted R-squared:  0.4475 
## F-statistic:  13.8 on 5 and 74 DF,  p-value: 1.618e-09
gf_point(inc ~ educ, color = ~ race, data = Income) %>% gf_lm()

Test for no interaction

Test for no interaction

plotModel(model1)

plotModel(model2)

summary(model1)$r.squared
## [1] 0.4619906
summary(model2)$r.squared
## [1] 0.4824822

Hypothesis and test statistic

Test for no interaction in R

anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: inc ~ educ + race
## Model 2: inc ~ educ * race
##   Res.Df   RSS Df Sum of Sq     F Pr(>F)
## 1     76 18164                          
## 2     74 17472  2    691.84 1.465 0.2377

Hierarchy of models

Hierarchy of models

Interaction <- lm(inc ~ educ * race, data = Income)
MainEffects <- lm(inc ~ educ + race, data = Income)
educEff <- lm(inc ~ educ, data = Income)
raceEff <- lm(inc ~ race, data = Income)

Example

anova(MainEffects, Interaction)
## Analysis of Variance Table
## 
## Model 1: inc ~ educ + race
## Model 2: inc ~ educ * race
##   Res.Df   RSS Df Sum of Sq     F Pr(>F)
## 1     76 18164                          
## 2     74 17472  2    691.84 1.465 0.2377
anova(educEff, MainEffects)
## Analysis of Variance Table
## 
## Model 1: inc ~ educ
## Model 2: inc ~ educ + race
##   Res.Df   RSS Df Sum of Sq      F  Pr(>F)  
## 1     78 19625                              
## 2     76 18164  2    1460.6 3.0556 0.05292 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(raceEff, MainEffects)
## Analysis of Variance Table
## 
## Model 1: inc ~ race
## Model 2: inc ~ educ + race
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1     77 30410                                  
## 2     76 18164  1     12245 51.235 4.422e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model with only a single categorical preditor

gf_point(inc ~ race, data = Income, color = ~race) %>% 
  gf_jitter(width = 0.1) %>% 
  gf_point(mean ~ race, data = favstats(inc ~ race, data = Income), 
           size = 6, shape = 4, stroke = 2, show.legend = FALSE)

summary(raceEff)
## 
## Call:
## lm(formula = inc ~ race, data = Income)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24.48 -12.48  -4.48   7.52  77.52 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   42.480      2.810  15.115   <2e-16 ***
## raceb        -14.730      5.708  -2.581   0.0118 *  
## raceh        -11.480      6.009  -1.910   0.0598 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.87 on 77 degrees of freedom
## Multiple R-squared:  0.0993, Adjusted R-squared:  0.0759 
## F-statistic: 4.244 on 2 and 77 DF,  p-value: 0.01784

Example with more predictors

Ericksen <- read.delim("https://asta.math.aau.dk/datasets?file=Ericksen.txt")
model <- lm(crime ~ city * highschool + city * poverty, data = Ericksen)
summary(model)
## 
## Call:
## lm(formula = crime ~ city * highschool + city * poverty, data = Ericksen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.741  -8.745  -1.557   7.820  47.470 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           61.1456    18.1254   3.373 0.001305 ** 
## citystate             18.1526    20.4131   0.889 0.377413    
## highschool            -1.5711     0.6062  -2.592 0.011979 *  
## poverty                5.3105     1.4333   3.705 0.000463 ***
## citystate:highschool   0.7025     0.7327   0.959 0.341523    
## citystate:poverty     -5.1862     1.6619  -3.121 0.002773 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.16 on 60 degrees of freedom
## Multiple R-squared:  0.6577, Adjusted R-squared:  0.6292 
## F-statistic: 23.06 on 5 and 60 DF,  p-value: 7.748e-13