Chapter 4 Multiple Linear Regression
4.1 Exploring Data With Multiple Predictor Variables
Pairs Plot (Scatterplot Matrix)
To see how all pairs of variables in a data set are related, use pairs(data). For an example with the carSales data from Section 3.6,
#can make it symmetric on diagonal by leaving as `pairs(carSales)`
pairs(carSales, lower.panel = NULL)
The upper panel here can be customised by making a new function.
Example from Section 3.6
# Customize upper panel
upper.panel <- function(x, y){
  points(x, y, pch = 19, col = 4)
  r <- round(cor(x, y), digits = 3)
  txt <- paste0("r = ", r)
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  text(0.5, 0.9, txt)
}

pairs(carSales, lower.panel = NULL,
      upper.panel = upper.panel)
Example from Practical 2
panel.cor <- function(x, y){
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- round(cor(x, y), digits = 2)
  txt <- paste0(" ", r)
  text(0.5, 0.5, txt, cex = 0.8)
}

# Customize upper panel
upper.panel <- function(x, y){
  points(x, y, pch = 19)
}

# Create the plots
pairs(fat1,
      lower.panel = panel.cor,
      upper.panel = upper.panel)
Using the corrplot Package
library(corrplot)
Examples from Practical 2
corrplot(cor(fat1), method = "number", type = "upper", diag = FALSE)
corrplot.mixed(cor(fat1), upper = "ellipse", lower = "number", number.cex = .7)
Plots To Show Issues Due To Multicollinearity and \(p\) Close to \(n\)
For plots showing how ridge regression lowers variance when there is multicollinearity or \(p\) is close to \(n\), see Section 6.2 of the notes.
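As a rough illustration (not taken from the notes), the sketch below simulates a small data set with two nearly collinear predictors and compares how much the OLS and ridge estimates of one coefficient vary across repeated samples; it assumes the MASS package is available for lm.ridge, and all variable names are made up.
library(MASS)   # for lm.ridge

set.seed(1)
ols.b1 <- ridge.b1 <- numeric(200)
for (s in 1:200) {
  x1 <- rnorm(30)
  x2 <- x1 + rnorm(30, sd = 0.05)   # nearly collinear with x1
  y  <- 1 + 2*x1 + 2*x2 + rnorm(30)
  ols.b1[s]   <- coef(lm(y ~ x1 + x2))[2]                      # OLS estimate of the x1 coefficient
  ridge.b1[s] <- coef(lm.ridge(y ~ x1 + x2, lambda = 10))[2]   # ridge estimate of the x1 coefficient
}
# Ridge estimates vary far less than OLS when the predictors are collinear
boxplot(list(OLS = ols.b1, Ridge = ridge.b1),
        ylab = "Estimated coefficient of x1")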
4.2 Basics of Building an MLR Model (Including Transformations)
Build a Model With Specific Predictors
model <- lm(Y ~ X1 + X2 + ... + Xn, data = data)
Build a Model With All Predictors
model <- lm(Y ~ ., data = data)
Build a Model With Specific Predictors Removed
model <- lm(Y ~ . - X1 - X2, data = data)
Example Including Transformations
model <- lm(Y ~ X1 + X2*X3 + log(X4) + I(X5^2), data = data)
Note that X2*X3 includes the predictors X2, X3 and their product (the interaction term X2:X3).
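As a quick check on this with a small made-up data set, the two calls below fit exactly the same model, since X2*X3 expands to the main effects plus the interaction X2:X3.
# Hypothetical illustrative data
set.seed(1)
dat <- data.frame(X2 = rnorm(50), X3 = rnorm(50))
dat$Y <- 1 + dat$X2 + dat$X3 + 0.5*dat$X2*dat$X3 + rnorm(50)

m1 <- lm(Y ~ X2*X3, data = dat)
m2 <- lm(Y ~ X2 + X3 + X2:X3, data = dat)
all.equal(coef(m1), coef(m2))   # TRUE: identical fits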
Inference is much the same as with SLR.
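For example, assuming model is a fitted lm object as above, the same standard functions used for SLR apply:
summary(model)    # coefficient estimates, t-tests, overall F-test, R-squared
confint(model)    # confidence intervals for the coefficients
predict(model, newdata, interval = "confidence")   # fitted values with confidence intervals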
4.3 Multicollinearity Detection and VIF
To calculate the VIF for each predictor in a model, simply use the vif() function from the car library.
library(car)
vif(reg)
## Age Miles
## 3.907129 3.907129
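The VIF for a predictor is \(1/(1-R_j^2)\), where \(R_j^2\) is the \(R^2\) from regressing that predictor on all the other predictors. A quick check of the output above, assuming reg was fitted to the carSales data with predictors Age and Miles:
r2.age <- summary(lm(Age ~ Miles, data = carSales))$r.squared
1 / (1 - r2.age)   # should match vif(reg)["Age"]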
4.4 Model Selection
The following subsections closely follow Practical Demonstration 5.4; most examples are taken from there.
Best Subset and Forward/Backward Stepwise Selection (Using Selection Criteria)
Best subset selection is done using the regsubsets() function from the leaps library.
library(leaps)
# Function for best subset selection
best = regsubsets(Salary ~ ., Hitters, nvmax = 19)
results = summary(best)

names(results) # components that can be extracted from the summary
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
nvmax = 19 specifies that the best model of every size up to 19 predictors is returned, rather than stopping at the default maximum of 8.
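Forward and backward stepwise selection (also named in this subsection's heading) use the same function via its method argument; a short sketch on the same Hitters data:
fwd = regsubsets(Salary ~ ., Hitters, nvmax = 19, method = "forward")
bwd = regsubsets(Salary ~ ., Hitters, nvmax = 19, method = "backward")
summary(fwd)$bic   # criteria are extracted exactly as for best subset selection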
Values you can extract from the summary of the regsubsets() output:
RSS = results$rss
r2 = results$rsq
Cp = results$cp
BIC = results$bic
Adj_r2 = results$adjr2
cbind(RSS, r2, Cp, BIC, Adj_r2)
## RSS r2 Cp BIC Adj_r2
## [1,] 36179679 0.3214501 104.281319 -90.84637 0.3188503
## [2,] 30646560 0.4252237 50.723090 -128.92622 0.4208024
## [3,] 29249297 0.4514294 38.693127 -135.62693 0.4450753
## [4,] 27970852 0.4754067 27.856220 -141.80892 0.4672734
## [5,] 27149899 0.4908036 21.613011 -144.07143 0.4808971
## [6,] 26194904 0.5087146 14.023870 -147.91690 0.4972001
## [7,] 25906548 0.5141227 13.128474 -145.25594 0.5007849
## [8,] 25136930 0.5285569 7.400719 -147.61525 0.5137083
## [9,] 24814051 0.5346124 6.158685 -145.44316 0.5180572
## [10,] 24500402 0.5404950 5.009317 -143.21651 0.5222606
## [11,] 24387345 0.5426153 5.874113 -138.86077 0.5225706
## [12,] 24333232 0.5436302 7.330766 -133.87283 0.5217245
## [13,] 24289148 0.5444570 8.888112 -128.77759 0.5206736
## [14,] 24248660 0.5452164 10.481576 -123.64420 0.5195431
## [15,] 24235177 0.5454692 12.346193 -118.21832 0.5178661
## [16,] 24219377 0.5457656 14.187546 -112.81768 0.5162219
## [17,] 24209447 0.5459518 16.087831 -107.35339 0.5144464
## [18,] 24201837 0.5460945 18.011425 -101.86391 0.5126097
## [19,] 24200700 0.5461159 20.000000 -96.30412 0.5106270
Plotting To Understand Best Subset Selection
#Plots RSS and R-square as you add more predictors
par(mfrow = c(1, 2))
plot(RSS, xlab = "Number of Predictors", ylab = "RSS",
type = "l", lwd = 2)
plot(r2, xlab = "Number of Predictors", ylab = "R-square",
type = "l", lwd = 2)
Number of predictors in optimal model under different selection criteria
which.min(Cp)
## [1] 10
which.min(BIC)
## [1] 6
which.max(Adj_r2)
## [1] 11
Example Plot For How Selection Criteria Change
par(mfrow = c(1, 3))
plot(Cp, xlab = "Number of Predictors", ylab = "Cp",
type = 'l', lwd = 2)
points(10, Cp[10], col = "red", cex = 2, pch = 8, lwd = 2)
plot(BIC, xlab = "Number of Predictors", ylab = "BIC",
type = 'l', lwd = 2)
points(6, BIC[6], col = "red", cex = 2, pch = 8, lwd = 2)
plot(Adj_r2, xlab = "Number of Predictors", ylab = "Adjusted RSq",
type = "l", lwd = 2)
points(11, Adj_r2[11], col = "red", cex = 2, pch = 8, lwd = 2)
Built-in Plot for Criteria
The top row is the best model under the chosen criterion and the bottom row is the worst.
plot(best, scale = "Cp")
The options for scale are "bic", "Cp", "adjr2" and "r2".
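For example, to rank the models by BIC instead:
plot(best, scale = "bic")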
Extracting Coefficients for Best Model (at a specific number of predictors)
coef(best, 10) #Cp
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
## CRBI CWalks DivisionW PutOuts Assists
## 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
The number here, k = 10, corresponds to model \(M_k\); we found it from which.min(Cp).
Best Subset Selection (Using Validation)
regsubsets() does not have a built-in way to do best subset selection using validation, so we use a custom function (copy and paste).
predict.regsubsets = function(object, newdata, id, ...){
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
object should be the result of a call to regsubsets
newdata should be a data frame containing the validation set
id specifies that we want to use the model from object with id predictors
Splitting Data into Training and Validation
As a rough guide, split the data approximately 2:1 for Training:Validation.
dim(Hitters)
training.obs = sample(1:263, 175)
Hitters.train = Hitters[training.obs, ]
Hitters.test = Hitters[-training.obs, ]
Best Subset Validation
The regsubsets call:
best.val = regsubsets(Salary ~ ., data = Hitters.train, nvmax = 19)
Iterate through every number of predictors and calculate the MSE between the predicted and observed values in the validation set.
val.error <- c()
for(i in 1:19){
  pred = predict.regsubsets(best.val, Hitters.test, i)
  val.error[i] = mean((Hitters.test$Salary - pred)^2) # MSE
}

val.error # validation MSE for each of the 19 models
## [1] 97930.78 73693.58 105273.10 112938.68 81937.07 77800.93 77762.84
## [8] 85164.27 81852.33 81834.60 83726.73 84971.76 85650.07 84864.01
## [15] 85721.48 86244.82 86266.13 86340.15 86319.27
The index with the smallest MSE gives the best number of predictors.
which.min(val.error)
## [1] 2
Inference
After choosing the optimal model with best subset selection using a validation set, you must re-train that model on the entire data set before doing inference.
coef(best.val, 10) # Check which variables to use in the lm.
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 202.3561532 -2.4493146 7.1926151 6.5117486 -0.2543722 2.2119009
## CRBI CWalks DivisionW PutOuts Assists
## 0.9966418 -0.9576867 -132.1967526 0.3656113 0.3545804
ls10 = lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
            Division + PutOuts + Assists, data = Hitters)
Usual inference can follow.
Note: using a validation set means the choice of the best model is random, depending on how you selected which observations were used for training and which were used for validation.
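To see this, one can repeat the split with a different (arbitrarily chosen) seed and compare the selected model size; a sketch reusing the objects defined above:
set.seed(10)   # hypothetical alternative seed
training.obs = sample(1:263, 175)
Hitters.train = Hitters[training.obs, ]
Hitters.test = Hitters[-training.obs, ]
best.val = regsubsets(Salary ~ ., data = Hitters.train, nvmax = 19)
val.error <- c()
for(i in 1:19){
  pred = predict.regsubsets(best.val, Hitters.test, i)
  val.error[i] = mean((Hitters.test$Salary - pred)^2)
}
which.min(val.error)   # may differ from the value obtained above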
Best Subset Selection (Using Cross-Validation)
Cross-validation is generally preferable to regular validation as it utilises the entire data set for training and testing.
Continuing with the Hitters data from Example 5.4,
best = regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
This method requires manually specifying the models to include in the cross-validation. So, for this example, we use cross-validation to compare the three models suggested by \(C_p\), BIC and adjusted \(R^2\). (Recall: we could use coef(best, 10) to find the coefficients of the \(C_p\) model, since \(C_p\) selected 10 predictors as best.)
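Similarly, the predictors for the 6- and 11-variable models (chosen by BIC and adjusted \(R^2\)) can be read off from the best subset output; these should correspond to the ls6 and ls11 models fitted below.
coef(best, 6)    # predictors in the BIC-chosen model
coef(best, 11)   # predictors in the adjusted R-squared model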
Train Models on Entire Data Set
ls10 = lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
            Division + PutOuts + Assists, data = Hitters)
ls6 = lm(Salary ~ AtBat + Hits + Walks + CRBI + Division + PutOuts,
         data = Hitters)
ls11 = lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
            League + Division + PutOuts + Assists, data = Hitters)
Creating The Folds
Using \(k = 10\) folds,
k = 10
folds = cut(1:263, breaks = 10, labels = FALSE)
table(folds)
## folds
## 1 2 3 4 5 6 7 8 9 10
## 27 26 26 26 27 26 26 26 26 27
However, the folds are not currently random; the first fold is just the first 27 entries. We can shuffle the folds randomly using sample().
set.seed(2)
folds = sample(folds)
folds
## [1] 8 10 8 7 3 5 3 6 9 2 3 4 5 9 7 2 10 5 7 6 2 10 8 6 4
## [26] 1 5 8 7 5 10 4 5 10 8 8 1 5 9 2 2 10 8 10 1 1 7 9 2 6
## [51] 5 8 3 4 8 2 1 6 8 10 5 7 2 3 7 7 1 1 7 3 9 8 4 3 6
## [76] 7 5 5 10 2 5 6 2 1 4 5 9 1 3 3 8 3 10 8 6 1 2 6 1 4
## [101] 9 6 4 10 9 1 9 7 9 8 4 8 6 1 10 2 10 10 7 9 3 9 4 4 5
## [126] 6 10 3 9 2 8 8 6 2 2 6 1 9 1 10 7 4 2 4 8 5 8 1 6 3
## [151] 1 10 10 1 3 3 3 7 3 4 2 9 4 6 8 9 7 2 1 7 4 3 5 7 8
## [176] 4 9 7 9 5 2 1 1 6 4 4 5 10 6 5 1 10 9 7 3 1 5 2 7 7
## [201] 4 6 7 10 3 6 5 10 4 9 9 5 7 7 2 6 9 5 5 3 2 3 5 5 9
## [226] 3 6 1 1 9 8 6 8 9 10 4 4 10 10 5 2 3 3 6 3 4 10 9 6 2
## [251] 1 8 6 7 8 4 2 8 1 10 2 7 4
Calculating The CV Errors
We first create a matrix to store the CV errors,
cv.errors = matrix(NA, nrow = k, ncol = 3,
                   dimnames = list(NULL, c("ls10", "ls6", "ls11")))
cv.errors
## ls10 ls6 ls11
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] NA NA NA
## [4,] NA NA NA
## [5,] NA NA NA
## [6,] NA NA NA
## [7,] NA NA NA
## [8,] NA NA NA
## [9,] NA NA NA
## [10,] NA NA NA
Then, iterating through each fold, we obtain the CV errors,
for(i in 1:k){
  # Train models on all folds except fold i
  ls10 = lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
              Division + PutOuts + Assists, data = Hitters[folds != i, ])
  ls6 = lm(Salary ~ AtBat + Hits + Walks + CRBI + Division + PutOuts,
           data = Hitters[folds != i, ])
  ls11 = lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + CWalks +
              League + Division + PutOuts + Assists,
            data = Hitters[folds != i, ])

  # Predict values in fold i
  pred10 <- predict(ls10, newdata = Hitters[folds == i, ])
  pred6 <- predict(ls6, newdata = Hitters[folds == i, ])
  pred11 <- predict(ls11, newdata = Hitters[folds == i, ])

  # Calculate CV errors (MSE in fold i for each model)
  cv.errors[i, ] = c(mean((Hitters$Salary[folds == i] - pred10)^2),
                     mean((Hitters$Salary[folds == i] - pred6)^2),
                     mean((Hitters$Salary[folds == i] - pred11)^2))
}
cv.errors
## ls10 ls6 ls11
## [1,] 132495.25 127446.51 130581.86
## [2,] 111992.83 110608.25 109426.81
## [3,] 87822.97 98247.02 87132.03
## [4,] 113161.24 98812.43 112384.13
## [5,] 225490.54 231835.72 227007.73
## [6,] 83622.26 46935.72 85460.28
## [7,] 90430.09 95275.05 91963.45
## [8,] 100105.45 106254.47 98519.80
## [9,] 97701.55 97551.93 104063.92
## [10,] 48005.26 63602.55 46157.69
Calculating the means gives a performance metric for each model.
cv.mean.errors <- colMeans(cv.errors)
cv.mean.errors
## ls10 ls6 ls11
## 109082.7 107657.0 109269.8
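The model with the smallest mean CV error can then be picked out directly; from the output above that is ls6.
which.min(cv.mean.errors)   # ls6 has the smallest mean CV error here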