Homework on Identification of Influential Observations

library(dplyr)

Load data

# Read the whitespace-delimited APPENC07 file; the file is read without a
# header row (read.table default), so the column names are supplied here and
# assigned to the columns by position.
dataC07 <- read.table(
  "./data/APPENC07.txt",
  col.names = c(
    "ID",
    "SalesPrice",
    "FinishedSquareFeet",
    "NofBedrooms",
    "NofBathrooms",
    "AirConditioning",
    "GarageSize",
    "Pool",
    "YearBuilt",
    "Quality",
    "Style",
    "LotSize",
    "AdjacenttoHighway"
  )
)

Fit model

# Linear model for SalesPrice on all other columns of dataC07 except
# Quality and Style.
# NOTE(review): `.` expands to every remaining column, which includes ID --
# confirm that ID is meant to be a predictor.
fit0 <- lm(SalesPrice ~ . - Quality - Style, data = dataC07)

# Arrange the plotting device as a 2 x 2 grid, then draw the lm diagnostics.
par(mfrow = c(2, 2))
plot(fit0)

Find the indices of the influential observations that meet any of the following criteria: RSTUDENT > 4, Cook’s D > 0.12, or CovRatio < 0.8

# Row positions flagged by each criterion on its own.
rst <- which(rstudent(fit0) > 4)
cd <- which(cooks.distance(fit0) > 0.12)
covr <- which(covratio(fit0) < 0.8)

# Pool the three sets, drop repeated positions, then order them ascending.
index <- unique(c(cd, covr, rst))
index <- sort(index)
index
## [1] 72 73 79 80 96

Build table

# One row per flagged observation: its row position followed by the three
# influence statistics evaluated at that position.
DataTable <- cbind(
  index,
  rstudent(fit0)[index],
  cooks.distance(fit0)[index],
  covratio(fit0)[index]
)

# Replace the automatically generated column names with report labels.
colnames(DataTable) <- c("ID value", "RSTUDENT", "Cook’s D", "CovRatio")
DataTable
##    ID value RSTUDENT   Cook’s D  CovRatio
## 72       72 4.110688 0.02317113 0.7250078
## 73       73 5.243015 0.04284879 0.5838824
## 79       79 3.663288 0.01732241 0.7790232
## 80       80 3.700403 0.01550332 0.7732033
## 96       96 4.370469 0.13361571 0.7362033
Copyright © 2017 Ming Chen & Wenqiang Feng. All rights reserved.