Stat 410/510: Backwards, Forwards, and Stepwise variable selection 1

Stat 410/510: Backwards, Forwards, and Stepwise variable selection
> birds <- read.table("http://users.humboldt.edu//rizzardi//Data.dir/bird.txt",
+
header=T, skip=15)
> attach( birds )
>
> # Use age class (age, adult=1, juvenile=2), total length (tl), wing span (ae),
> # femur length (thigh bone, fl), and humerus (arm bone, hl) to predict weight
(wt)
> fit.all <- lm( wt ~ ae + ag + fl + hl + tl )
> summary( fit.all )
Call:
lm(formula = wt ~ ae + ag + fl + hl + tl)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -18.055780
7.640757 -2.363 0.020520 *
ae
0.006812
0.050789
0.134 0.893635
ag
-0.167180
0.273993 -0.610 0.543463
fl
-5.795307 11.288314 -0.513 0.609076
hl
24.462067 12.245164
1.998 0.049107 *
tl
0.177890
0.051351
3.464 0.000853 ***
--Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.162 on 81 degrees of freedom
Multiple R-squared: 0.3718,
Adjusted R-squared: 0.3331
F-statistic: 9.589 on 5 and 81 DF, p-value: 3.245e-07
1
> anova(fit.all)
Analysis of Variance Table
Response: wt
Df Sum Sq Mean Sq F value
Pr(>F)
ae
1 43.345 43.345 32.1142 2.156e-07 ***
ag
1
0.085
0.085 0.0629 0.8026650
fl
1
2.720
2.720 2.0151 0.1595742
hl
1
2.367
2.367 1.7534 0.1891771
tl
1 16.197 16.197 12.0008 0.0008527 ***
Residuals 81 109.326
1.350
--Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> # can define a formula object
> formula.all <- formula(wt~ae+ag+fl+hl+tl)
> formula.all
wt ~ ae + ag + fl + hl + tl
> lm(formula.all) # same as fit.all
Call:
lm(formula = formula.all)
Coefficients:
(Intercept)
-18.055780
ae
0.006812
ag
-0.167180
fl
-5.795307
hl
24.462067
tl
0.177890
>
> drop1(fit.all) # uses AIC statistic as criteria
Single term deletions
Model:
wt ~ ae + ag + fl + hl + tl
Df Sum of Sq
RSS
AIC
<none>
109.33 31.873
ae
1
0.0243 109.35 29.892
ag
1
0.5025 109.83 30.272
fl
1
0.3557 109.68 30.156
hl
1
5.3864 114.71 34.057
tl
1
16.1975 125.52 41.893
> # AIC = n*log(RSS/n)+2p, where p=number of parameters
> # Goal: minimize AIC
> # confirming AIC for fit.all
> 87 * log( sum(residuals(fit.all)^2) / 87) + 2*6
[1] 31.87317
> extractAIC(fit.all) # same thing
[1] 6.00000 31.87317
>
2
> drop1(fit.all,test="F") # uses so called "type II" ANOVA - decrease in RSS.
Single term deletions
Model:
wt ~ ae + ag + fl + hl + tl
Df Sum of Sq
RSS
AIC F value
Pr(>F)
<none>
109.33 31.873
ae
1
0.0243 109.35 29.892 0.0180 0.8936349
ag
1
0.5025 109.83 30.272 0.3723 0.5434627
fl
1
0.3557 109.68 30.156 0.2636 0.6090761
hl
1
5.3864 114.71 34.057 3.9908 0.0491067 *
tl
1
16.1975 125.52 41.893 12.0008 0.0008527 ***
--Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> step( fit.all, direction="backward" ) # repeated uses drop1()
Start: AIC=31.87
wt ~ ae + ag + fl + hl + tl
- ae
- fl
- ag
<none>
- hl
- tl
Df Sum of Sq
RSS
AIC
1
0.0243 109.35 29.892
1
0.3557 109.68 30.156
1
0.5025 109.83 30.272
109.33 31.873
1
5.3864 114.71 34.057
1
16.1975 125.52 41.893
Step: AIC=29.89
wt ~ ag + fl + hl + tl
Df Sum of Sq
RSS
- fl
1
0.3429 109.69
- ag
1
0.5708 109.92
<none>
109.35
- hl
1
6.7828 116.13
- tl
1
23.3115 132.66
AIC
28.165
28.345
29.892
33.128
44.705
Step: AIC=28.16
wt ~ ag + hl + tl
- ag
<none>
- hl
- tl
Df Sum of Sq
RSS
1
0.5303 110.22
109.69
1
16.1467 125.84
1
23.2088 132.90
AIC
26.584
28.165
38.112
42.862
3
Step: AIC=26.58
wt ~ hl + tl
Df Sum of Sq
<none>
- hl
- tl
1
1
RSS
AIC
110.22 26.584
15.811 126.03 36.246
22.948 133.17 41.038
Call:
lm(formula = wt ~ hl + tl)
Coefficients:
(Intercept)
-17.0972
hl
20.0488
tl
0.1755
> step( fit.all, direction="backward", test="F", trace=F ) # selects same model
Call:
lm(formula = wt ~ hl + tl)
Coefficients:
(Intercept)
-17.0972
hl
20.0488
tl
0.1755
>
> # can define a formula object
> formula.all <- formula(wt~ae+ag+fl+hl+tl)
> formula.all
wt ~ ae + ag + fl + hl + tl
> lm(formula.all) # same as fit.all
Call:
lm(formula = formula.all)
Coefficients:
(Intercept)
-18.055780
ae
0.006812
ag
-0.167180
fl
-5.795307
hl
24.462067
tl
0.177890
>
4
> fit.null <- lm( wt ~ 1 )
> add1( fit.null, scope=formula.all ) # formula.all sets range of choices
Single term additions
Model:
wt ~ 1
Df Sum of Sq
RSS
AIC
<none>
174.04 62.324
ae
1
43.345 130.69 39.405
ag
1
0.010 174.03 64.319
fl
1
34.197 139.84 45.291
hl
1
40.868 133.17 41.038
tl
1
48.005 126.03 36.246
>
> # "forward" direction repeated uses add1()
> step( fit.null, direction="forward", scope=formula.all ) # same model again
Start: AIC=62.32
wt ~ 1
+ tl
+ ae
+ hl
+ fl
<none>
+ ag
Df Sum of Sq
RSS
AIC
1
48.005 126.03 36.246
1
43.345 130.69 39.405
1
40.868 133.17 41.038
1
34.197 139.84 45.291
174.04 62.324
1
0.010 174.03 64.319
Step: AIC=36.25
wt ~ tl
Df Sum of Sq
RSS
+ hl
1
15.8111 110.22
+ fl
1
9.5958 116.44
+ ae
1
8.4463 117.59
<none>
126.03
+ ag
1
0.1947 125.84
AIC
26.584
31.357
32.212
36.246
38.112
Step: AIC=26.58
wt ~ tl + hl
Df Sum of Sq
<none>
+ ag
+ fl
+ ae
1
1
1
RSS
110.22
0.53026 109.69
0.30238 109.92
0.06409 110.16
AIC
26.584
28.165
28.345
28.534
Call:
lm(formula = wt ~ tl + hl)
Coefficients:
(Intercept)
-17.0972
tl
0.1755
hl
20.0488
5
>
> # Use best of "forward" and "backward"
> fit.middle <- lm(wt~ ae+ag+tl)
> step( fit.middle, direction="both", scope=formula.all ) # same model again
Start: AIC=34.19
wt ~ ae + ag + tl
Df Sum of Sq
RSS
AIC
+ hl
1
7.8830 109.68 30.156
- ag
1
0.0236 117.59 32.212
+ fl
1
2.8523 114.71 34.057
<none>
117.56 34.194
- ae
1
8.2751 125.84 38.112
- tl
1
13.0452 130.61 41.349
Step: AIC=30.16
wt ~ ae + ag + tl +
Df Sum of Sq
- ae
1
0.0114
- ag
1
0.4776
<none>
+ fl
1
0.3557
- hl
1
7.8830
- tl
1
15.8422
RSS
109.69
110.16
109.68
109.33
117.56
125.52
AIC
28.165
28.534
30.156
31.873
34.194
39.893
Step: AIC=28.16
wt ~ ag + tl + hl
Df Sum of Sq
- ag
1
0.5303
<none>
+ fl
1
0.3429
+ ae
1
0.0114
- hl
1
16.1467
- tl
1
23.2088
RSS
110.22
109.69
109.35
109.68
125.84
132.90
AIC
26.584
28.165
29.892
30.156
38.112
42.862
RSS
110.22
109.69
109.92
110.16
126.03
133.17
AIC
26.584
28.165
28.345
28.534
36.246
41.038
hl
Step: AIC=26.58
wt ~ tl + hl
Df Sum of Sq
<none>
+ ag
+ fl
+ ae
- hl
- tl
1
1
1
1
1
0.5303
0.3024
0.0641
15.8111
22.9480
Call:
lm(formula = wt ~ tl + hl)
Coefficients:
(Intercept)
tl
-17.0972
0.1755
hl
20.0488
pairs(formula.all)
6
birds <read.table("http://users.humboldt.edu//rizzardi//Data.dir/bird.txt",
header=T, skip=15)
attach( birds )
# Use age class (age, adult=1, juvenile=2), total length (tl), wing span
(ae),
# femur length (thigh bone, fl), and humerus (arm bone, hl) to predict
weight (wt)
fit.all <- lm( wt ~ ae + ag + fl + hl + tl )
summary( fit.all )
anova(fit.all)
# can define a formula object
formula.all <- formula(wt~ae+ag+fl+hl+tl)
formula.all
lm(formula.all) # same as fit.all
drop1(fit.all) # uses AIC statistic as criteria
# AIC = n*log(RSS/n)+2p, where p=number of parameters
# Goal: minimize AIC
# confirming AIC for fit.all
87 * log( sum(residuals(fit.all)^2) / 87) + 2*6
extractAIC(fit.all) # same thing
drop1(fit.all,test="F") # uses so called "type II" ANOVA - decrease in RSS.
step( fit.all, direction="backward" ) # repeated uses drop1()
step( fit.all, direction="backward", test="F", trace=F ) # selects same
model
# can define a formula object
formula.all <- formula(wt~ae+ag+fl+hl+tl)
formula.all
lm(formula.all) # same as fit.all
fit.null <- lm( wt ~ 1 )
add1( fit.null, scope=formula.all ) # formula.all sets range of choices
# "forward" direction repeated uses add1()
step( fit.null, direction="forward", scope=formula.all ) # same model again
# Use best of "forward" and "backward"
fit.middle <- lm(wt~ ae+ag+tl)
step( fit.middle, direction="both", scope=formula.all ) # same model again
pairs(formula.all)
7