Homework on Variable Selection

library("R330")
library(MASS)
library(leaps)

Load data

# Read the data file and label its 13 columns.
dataC07 <- read.table(
  "./data/APPENC07.txt",
  col.names = c(
    "ID", "SalesPrice", "FinishedSquareFeet", "NofBedrooms", "NofBathrooms",
    "AirConditioning", "GarageSize", "Pool", "YearBuilt",
    "Quality", "Style", "LotSize", "AdjacenttoHighway"
  )
)

1. Use All Possible Regressions to select a best model

  • Run allpossregs() on every column except Quality and Style
# All-possible-regressions search over every column except Quality and Style.
# NOTE(review): `.` also expands to ID (it shows up as an indicator column in
# the printed table) -- confirm that ID is really meant to be a candidate.
# Removing it would shift the column positions used further down.
allPossRegsSelect <- allpossregs(
  SalesPrice ~ . - Quality - Style,
  data = dataC07
)

allPossRegsSelect
##            rssp     sigma2 adjRsq      Cp     AIC     BIC           CV ID
## 1  3.255426e+12 6260433856  0.671 173.759 695.759 704.275 328548061695  0
## 2  2.792410e+12 5380365830  0.717  77.371 599.371 612.144 283659722282  0
## 3  2.597213e+12 5013925571  0.736  37.893 559.893 576.924 266477790722  0
## 4  2.522941e+12 4879963900  0.743  24.111 546.111 567.399 259722391665  0
## 5  2.477092e+12 4800565511  0.748  16.368 538.368 563.914 255765659965  1
## 6  2.436421e+12 4730913835  0.751   9.725 531.725 561.529 253281426181  1
## 7  2.414708e+12 4697875609  0.753   7.112 529.112 563.173 251645880447  1
## 8  2.409249e+12 4696392549  0.753   7.952 529.952 568.271 252737701207  1
## 9  2.404789e+12 4696853101  0.753   9.004 531.004 573.581 253796351300  1
## 10 2.404771e+12 4706008830  0.753  11.000 533.000 579.834 254700238543  1
##    FinishedSquareFeet NofBedrooms NofBathrooms AirConditioning GarageSize
## 1                   1           0            0               0          0
## 2                   1           0            0               0          0
## 3                   1           0            0               0          0
## 4                   1           0            0               0          1
## 5                   1           0            0               0          1
## 6                   1           1            0               0          1
## 7                   1           1            0               0          1
## 8                   1           1            0               0          1
## 9                   1           1            1               0          1
## 10                  1           1            1               1          1
##    Pool YearBuilt LotSize AdjacenttoHighway
## 1     0         0       0                 0
## 2     0         1       0                 0
## 3     0         1       1                 0
## 4     0         1       1                 0
## 5     0         1       1                 0
## 6     0         1       1                 0
## 7     0         1       1                 1
## 8     1         1       1                 1
## 9     1         1       1                 1
## 10    1         1       1                 1
  • Select the best model: The best model has the smallest Cp. Here it is model 7 (Cp = 7.112). The variables that should be included in the model:
# Row of the search table with the smallest value in the Cp column.
minCp <- which.min(allPossRegsSelect[, "Cp"])
minCp
## 7 
## 7
# Drop the first eight columns (seven summary statistics plus the ID
# indicator) and list the predictors flagged as present in the chosen row.
vars <- which(allPossRegsSelect[minCp, -(1:8)] != 0)
vars
## FinishedSquareFeet        NofBedrooms         GarageSize 
##                  1                  2                  5 
##          YearBuilt            LotSize  AdjacenttoHighway 
##                  7                  8                  9
  • Obtain the model parameters
# Fit the model chosen by the all-possible-regressions search, i.e. exactly
# the six predictors listed in `vars`. The terms are spelled out instead of
# using `. - NofBathrooms - Pool`, because `.` would also pull in ID,
# AirConditioning, Quality and Style, none of which are in `vars`.
# NOTE(review): re-knit so the printed call and coefficients reflect this fit.
allPossRegsFit <- lm(
  SalesPrice ~ FinishedSquareFeet + NofBedrooms + GarageSize +
    YearBuilt + LotSize + AdjacenttoHighway,
  data = dataC07
)
allPossRegsFit
## 
## Call:
## lm(formula = SalesPrice ~ . - NofBathrooms - Pool, data = dataC07)
## 
## Coefficients:
##        (Intercept)                  ID  FinishedSquareFeet  
##         -2.414e+06          -2.781e+01           1.314e+02  
##        NofBedrooms     AirConditioning          GarageSize  
##         -7.925e+03          -1.227e+04           1.402e+04  
##          YearBuilt             Quality               Style  
##          1.279e+03          -4.618e+04          -9.457e+03  
##            LotSize   AdjacenttoHighway  
##          1.170e+00          -4.526e+04

2. Backward elimination

# Backward elimination by AIC, starting from the full candidate model.
# ID looks like a row identifier (it is left out of the forward-selection
# scope and of the final model), so it is removed from the `.` expansion to
# keep the candidate set the same across methods.
# NOTE(review): re-knit to refresh the printed trace and coefficients.
fit0 <- lm(SalesPrice ~ . - ID - Quality - Style, data = dataC07)
BEfit <- stepAIC(fit0, direction = "backward")
## Start:  AIC=11636.92
## SalesPrice ~ (ID + FinishedSquareFeet + NofBedrooms + NofBathrooms + 
##     AirConditioning + GarageSize + Pool + YearBuilt + Quality + 
##     Style + LotSize + AdjacenttoHighway) - Quality - Style
## 
##                      Df  Sum of Sq        RSS   AIC
## - AirConditioning     1 1.8275e+07 2.4048e+12 11635
## - NofBathrooms        1 4.4789e+09 2.4092e+12 11636
## - Pool                1 4.5538e+09 2.4093e+12 11636
## <none>                             2.4048e+12 11637
## - AdjacenttoHighway   1 1.9403e+10 2.4242e+12 11639
## - NofBedrooms         1 4.5459e+10 2.4502e+12 11645
## - ID                  1 4.6623e+10 2.4514e+12 11645
## - GarageSize          1 6.2207e+10 2.4670e+12 11648
## - LotSize             1 1.4513e+11 2.5499e+12 11666
## - YearBuilt           1 2.7980e+11 2.6846e+12 11692
## - FinishedSquareFeet  1 1.2973e+12 3.7020e+12 11860
## 
## Step:  AIC=11634.93
## SalesPrice ~ ID + FinishedSquareFeet + NofBedrooms + NofBathrooms + 
##     GarageSize + Pool + YearBuilt + LotSize + AdjacenttoHighway
## 
##                      Df  Sum of Sq        RSS   AIC
## - NofBathrooms        1 4.4606e+09 2.4092e+12 11634
## - Pool                1 4.5367e+09 2.4093e+12 11634
## <none>                             2.4048e+12 11635
## - AdjacenttoHighway   1 1.9385e+10 2.4242e+12 11637
## - NofBedrooms         1 4.5899e+10 2.4507e+12 11643
## - ID                  1 4.6636e+10 2.4514e+12 11643
## - GarageSize          1 6.3062e+10 2.4679e+12 11646
## - LotSize             1 1.4755e+11 2.5523e+12 11664
## - YearBuilt           1 2.9812e+11 2.7029e+12 11694
## - FinishedSquareFeet  1 1.2994e+12 3.7042e+12 11858
## 
## Step:  AIC=11633.89
## SalesPrice ~ ID + FinishedSquareFeet + NofBedrooms + GarageSize + 
##     Pool + YearBuilt + LotSize + AdjacenttoHighway
## 
##                      Df  Sum of Sq        RSS   AIC
## - Pool                1 5.4587e+09 2.4147e+12 11633
## <none>                             2.4092e+12 11634
## - AdjacenttoHighway   1 2.1033e+10 2.4303e+12 11636
## - NofBedrooms         1 4.1439e+10 2.4507e+12 11641
## - ID                  1 5.2921e+10 2.4622e+12 11643
## - GarageSize          1 6.4549e+10 2.4738e+12 11646
## - LotSize             1 1.5279e+11 2.5620e+12 11664
## - YearBuilt           1 3.4322e+11 2.7525e+12 11701
## - FinishedSquareFeet  1 1.7064e+12 4.1157e+12 11911
## 
## Step:  AIC=11633.07
## SalesPrice ~ ID + FinishedSquareFeet + NofBedrooms + GarageSize + 
##     YearBuilt + LotSize + AdjacenttoHighway
## 
##                      Df  Sum of Sq        RSS   AIC
## <none>                             2.4147e+12 11633
## - AdjacenttoHighway   1 2.1713e+10 2.4364e+12 11636
## - NofBedrooms         1 3.9811e+10 2.4545e+12 11640
## - ID                  1 5.4061e+10 2.4688e+12 11643
## - GarageSize          1 6.6157e+10 2.4809e+12 11645
## - LotSize             1 1.4926e+11 2.5640e+12 11662
## - YearBuilt           1 3.4008e+11 2.7548e+12 11700
## - FinishedSquareFeet  1 1.7299e+12 4.1446e+12 11913
# Coefficients of the model returned by the backward search.
coef(BEfit)
##        (Intercept)                 ID FinishedSquareFeet 
##      -3.509350e+06      -8.615438e+01       1.227634e+02 
##        NofBedrooms         GarageSize          YearBuilt 
##      -1.042371e+04       2.164728e+04       1.772672e+03 
##            LotSize  AdjacenttoHighway 
##       1.530331e+00      -4.684006e+04

3. Forward selection

# Forward selection by AIC: start from the intercept-only model and let
# stepAIC() add terms from the nine candidate predictors named in `scope`.
fit0 <- lm(SalesPrice ~ 1, data = dataC07)
FSfit <- stepAIC(
  fit0,
  scope = SalesPrice ~ FinishedSquareFeet + NofBedrooms + NofBathrooms +
    AirConditioning + GarageSize + Pool + YearBuilt +
    LotSize + AdjacenttoHighway,
  direction = "forward"
)
## Start:  AIC=12356.17
## SalesPrice ~ 1
## 
##                      Df  Sum of Sq        RSS   AIC
## + FinishedSquareFeet  1 6.6555e+12 3.2554e+12 11777
## + NofBathrooms        1 4.6326e+12 5.2783e+12 12029
## + GarageSize          1 3.3086e+12 6.6023e+12 12146
## + YearBuilt           1 3.0585e+12 6.8524e+12 12166
## + NofBedrooms         1 1.6931e+12 8.2178e+12 12260
## + AirConditioning     1 8.2546e+11 9.0855e+12 12313
## + LotSize             1 4.9804e+11 9.4129e+12 12331
## + Pool                1 2.1303e+11 9.6979e+12 12347
## <none>                             9.9109e+12 12356
## + AdjacenttoHighway   1 2.5746e+10 9.8852e+12 12357
## 
## Step:  AIC=11777.02
## SalesPrice ~ FinishedSquareFeet
## 
##                     Df  Sum of Sq        RSS   AIC
## + YearBuilt          1 4.6302e+11 2.7924e+12 11699
## + GarageSize         1 2.7313e+11 2.9823e+12 11733
## + NofBathrooms       1 9.6767e+10 3.1587e+12 11763
## + LotSize            1 9.1880e+10 3.1635e+12 11764
## + AirConditioning    1 5.0865e+10 3.2046e+12 11771
## + NofBedrooms        1 2.7613e+10 3.2278e+12 11775
## <none>                            3.2554e+12 11777
## + Pool               1 1.8642e+09 3.2536e+12 11779
## + AdjacenttoHighway  1 1.6494e+07 3.2554e+12 11779
## 
## Step:  AIC=11698.93
## SalesPrice ~ FinishedSquareFeet + YearBuilt
## 
##                     Df  Sum of Sq        RSS   AIC
## + LotSize            1 1.9520e+11 2.5972e+12 11663
## + GarageSize         1 1.1221e+11 2.6802e+12 11680
## + NofBedrooms        1 3.4925e+10 2.7575e+12 11694
## + NofBathrooms       1 1.1763e+10 2.7806e+12 11699
## <none>                            2.7924e+12 11699
## + Pool               1 2.7315e+09 2.7897e+12 11700
## + AdjacenttoHighway  1 1.9392e+09 2.7905e+12 11701
## + AirConditioning    1 3.0690e+08 2.7921e+12 11701
## 
## Step:  AIC=11663.11
## SalesPrice ~ FinishedSquareFeet + YearBuilt + LotSize
## 
##                     Df  Sum of Sq        RSS   AIC
## + GarageSize         1 7.4272e+10 2.5229e+12 11650
## + NofBedrooms        1 4.4544e+10 2.5527e+12 11656
## <none>                            2.5972e+12 11663
## + AdjacenttoHighway  1 8.0710e+09 2.5891e+12 11664
## + Pool               1 6.7983e+09 2.5904e+12 11664
## + NofBathrooms       1 3.6792e+09 2.5935e+12 11664
## + AirConditioning    1 5.9687e+08 2.5966e+12 11665
## 
## Step:  AIC=11649.96
## SalesPrice ~ FinishedSquareFeet + YearBuilt + LotSize + GarageSize
## 
##                     Df  Sum of Sq        RSS   AIC
## + NofBedrooms        1 4.5839e+10 2.4771e+12 11642
## <none>                            2.5229e+12 11650
## + AdjacenttoHighway  1 8.2844e+09 2.5147e+12 11650
## + Pool               1 5.0226e+09 2.5179e+12 11651
## + NofBathrooms       1 2.0460e+09 2.5209e+12 11652
## + AirConditioning    1 1.9117e+08 2.5228e+12 11652
## 
## Step:  AIC=11642.39
## SalesPrice ~ FinishedSquareFeet + YearBuilt + LotSize + GarageSize + 
##     NofBedrooms
## 
##                     Df  Sum of Sq        RSS   AIC
## + NofBathrooms       1 1.3145e+10 2.4640e+12 11642
## <none>                            2.4771e+12 11642
## + AdjacenttoHighway  1 8.3328e+09 2.4688e+12 11643
## + Pool               1 6.9446e+09 2.4702e+12 11643
## + AirConditioning    1 8.6472e+07 2.4770e+12 11644
## 
## Step:  AIC=11641.61
## SalesPrice ~ FinishedSquareFeet + YearBuilt + LotSize + GarageSize + 
##     NofBedrooms + NofBathrooms
## 
##                     Df  Sum of Sq        RSS   AIC
## <none>                            2.4640e+12 11642
## + AdjacenttoHighway  1 7518607679 2.4564e+12 11642
## + Pool               1 5252807794 2.4587e+12 11642
## + AirConditioning    1    1136218 2.4640e+12 11644
# Coefficients of the model returned by the forward search.
coef(FSfit)
##        (Intercept) FinishedSquareFeet          YearBuilt 
##      -3.567709e+06       1.257386e+02       1.779611e+03 
##            LotSize         GarageSize        NofBedrooms 
##       1.554990e+00       2.253038e+04      -1.304139e+04 
##       NofBathrooms 
##       7.987552e+03

4. Stepwise

# Stepwise selection by AIC (terms may be dropped or re-added at each step),
# starting from the full candidate model.
# ID looks like a row identifier (it is left out of the forward-selection
# scope and of the final model), so it is removed from the `.` expansion to
# keep the candidate set the same across methods.
# NOTE(review): re-knit to refresh the printed trace and coefficients.
fit0 <- lm(SalesPrice ~ . - ID - Quality - Style, data = dataC07)
SWfit <- stepAIC(fit0, direction = "both")
## Start:  AIC=11636.92
## SalesPrice ~ (ID + FinishedSquareFeet + NofBedrooms + NofBathrooms + 
##     AirConditioning + GarageSize + Pool + YearBuilt + Quality + 
##     Style + LotSize + AdjacenttoHighway) - Quality - Style
## 
##                      Df  Sum of Sq        RSS   AIC
## - AirConditioning     1 1.8275e+07 2.4048e+12 11635
## - NofBathrooms        1 4.4789e+09 2.4092e+12 11636
## - Pool                1 4.5538e+09 2.4093e+12 11636
## <none>                             2.4048e+12 11637
## - AdjacenttoHighway   1 1.9403e+10 2.4242e+12 11639
## - NofBedrooms         1 4.5459e+10 2.4502e+12 11645
## - ID                  1 4.6623e+10 2.4514e+12 11645
## - GarageSize          1 6.2207e+10 2.4670e+12 11648
## - LotSize             1 1.4513e+11 2.5499e+12 11666
## - YearBuilt           1 2.7980e+11 2.6846e+12 11692
## - FinishedSquareFeet  1 1.2973e+12 3.7020e+12 11860
## 
## Step:  AIC=11634.93
## SalesPrice ~ ID + FinishedSquareFeet + NofBedrooms + NofBathrooms + 
##     GarageSize + Pool + YearBuilt + LotSize + AdjacenttoHighway
## 
##                      Df  Sum of Sq        RSS   AIC
## - NofBathrooms        1 4.4606e+09 2.4092e+12 11634
## - Pool                1 4.5367e+09 2.4093e+12 11634
## <none>                             2.4048e+12 11635
## + AirConditioning     1 1.8275e+07 2.4048e+12 11637
## - AdjacenttoHighway   1 1.9385e+10 2.4242e+12 11637
## - NofBedrooms         1 4.5899e+10 2.4507e+12 11643
## - ID                  1 4.6636e+10 2.4514e+12 11643
## - GarageSize          1 6.3062e+10 2.4679e+12 11646
## - LotSize             1 1.4755e+11 2.5523e+12 11664
## - YearBuilt           1 2.9812e+11 2.7029e+12 11694
## - FinishedSquareFeet  1 1.2994e+12 3.7042e+12 11858
## 
## Step:  AIC=11633.89
## SalesPrice ~ ID + FinishedSquareFeet + NofBedrooms + GarageSize + 
##     Pool + YearBuilt + LotSize + AdjacenttoHighway
## 
##                      Df  Sum of Sq        RSS   AIC
## - Pool                1 5.4587e+09 2.4147e+12 11633
## <none>                             2.4092e+12 11634
## + NofBathrooms        1 4.4606e+09 2.4048e+12 11635
## + AirConditioning     1 1.3929e+04 2.4092e+12 11636
## - AdjacenttoHighway   1 2.1033e+10 2.4303e+12 11636
## - NofBedrooms         1 4.1439e+10 2.4507e+12 11641
## - ID                  1 5.2921e+10 2.4622e+12 11643
## - GarageSize          1 6.4549e+10 2.4738e+12 11646
## - LotSize             1 1.5279e+11 2.5620e+12 11664
## - YearBuilt           1 3.4322e+11 2.7525e+12 11701
## - FinishedSquareFeet  1 1.7064e+12 4.1157e+12 11911
## 
## Step:  AIC=11633.07
## SalesPrice ~ ID + FinishedSquareFeet + NofBedrooms + GarageSize + 
##     YearBuilt + LotSize + AdjacenttoHighway
## 
##                      Df  Sum of Sq        RSS   AIC
## <none>                             2.4147e+12 11633
## + Pool                1 5.4587e+09 2.4092e+12 11634
## + NofBathrooms        1 5.3825e+09 2.4093e+12 11634
## + AirConditioning     1 1.6340e+07 2.4147e+12 11635
## - AdjacenttoHighway   1 2.1713e+10 2.4364e+12 11636
## - NofBedrooms         1 3.9811e+10 2.4545e+12 11640
## - ID                  1 5.4061e+10 2.4688e+12 11643
## - GarageSize          1 6.6157e+10 2.4809e+12 11645
## - LotSize             1 1.4926e+11 2.5640e+12 11662
## - YearBuilt           1 3.4008e+11 2.7548e+12 11700
## - FinishedSquareFeet  1 1.7299e+12 4.1446e+12 11913
# Coefficients of the model returned by the stepwise search.
coef(SWfit)
##        (Intercept)                 ID FinishedSquareFeet 
##      -3.509350e+06      -8.615438e+01       1.227634e+02 
##        NofBedrooms         GarageSize          YearBuilt 
##      -1.042371e+04       2.164728e+04       1.772672e+03 
##            LotSize  AdjacenttoHighway 
##       1.530331e+00      -4.684006e+04

5. Maximum R squared improvement

# Single-term additions to the intercept-only model, each with an F test.
# NOTE(review): this evaluates only the first addition step; no further
# terms are added after it -- confirm that one step is all that is intended.
fit0 <- lm(SalesPrice ~ 1, data = dataC07)
maxRfit <- add1(
  fit0,
  scope = SalesPrice ~ FinishedSquareFeet + NofBedrooms + NofBathrooms +
    AirConditioning + GarageSize + Pool + YearBuilt + LotSize +
    AdjacenttoHighway,
  test = "F"
)
maxRfit
## Single term additions
## 
## Model:
## SalesPrice ~ 1
##                    Df  Sum of Sq        RSS   AIC   F value    Pr(>F)    
## <none>                           9.9109e+12 12356                        
## FinishedSquareFeet  1 6.6555e+12 3.2554e+12 11777 1063.1030 < 2.2e-16 ***
## NofBedrooms         1 1.6931e+12 8.2178e+12 12260  107.1382 < 2.2e-16 ***
## NofBathrooms        1 4.6326e+12 5.2783e+12 12029  456.3896 < 2.2e-16 ***
## AirConditioning     1 8.2546e+11 9.0855e+12 12313   47.2445 1.800e-11 ***
## GarageSize          1 3.3086e+12 6.6023e+12 12146  260.5897 < 2.2e-16 ***
## Pool                1 2.1303e+11 9.6979e+12 12347   11.4229 0.0007799 ***
## YearBuilt           1 3.0585e+12 6.8524e+12 12166  232.0956 < 2.2e-16 ***
## LotSize             1 4.9804e+11 9.4129e+12 12331   27.5134 2.274e-07 ***
## AdjacenttoHighway   1 2.5746e+10 9.8852e+12 12357    1.3544 0.2450523    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6. Using the information from the different variable selection methods, fit the best model.

# Final model with six hand-picked predictors; print its coefficients.
# NOTE(review): AirConditioning was not retained by any of the searches shown
# above -- confirm that it is meant to be part of the final model.
bestFit <- lm(
  SalesPrice ~ FinishedSquareFeet + NofBedrooms + AirConditioning +
    GarageSize + YearBuilt + LotSize,
  data = dataC07
)
coef(bestFit)
##        (Intercept) FinishedSquareFeet        NofBedrooms 
##      -3.758878e+06       1.317616e+02      -1.121647e+04 
##    AirConditioning         GarageSize          YearBuilt 
##       1.231936e+03       2.296119e+04       1.875860e+03 
##            LotSize 
##       1.599234e+00
Copyright © 2017 Ming Chen & Wenqiang Feng. All rights reserved.