# Read in data and load packages
library(ISLR)
library(leaps)
library(MASS)
library(boot)
library(car)
library(glmnet)
hitters <- read.csv("hitters.csv", header = T)
# Find VIF
mod <- lm(Salary ~ ., data = hitters)
modSumm <- summary(mod)
vif(mod)
## AtBat Hits HmRun Runs RBI Walks
## 22.944366 30.281255 7.758668 15.246418 11.921715 4.148712
## Years CAtBat CHits CHmRun CRuns CRBI
## 9.313280 251.561160 502.954289 46.488462 162.520810 131.965858
## CWalks League Division PutOuts Assists Errors
## 19.744105 4.134115 1.075398 1.236317 2.709341 2.214543
## NewLeague
## 4.099063
Many of the VIF values are above 10, which suggests that multicollinearity is an issue. The three most affected variables are CHits, CAtBat, and CRuns, in that order.
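As a quick check (an illustrative sketch, not part of the original output), a single VIF can be reproduced by hand: regress one predictor on all of the remaining predictors and apply 1/(1 - R^2). For CHits this should closely reproduce the value of roughly 503 reported above.
aux <- lm(CHits ~ . - Salary, data = hitters) # auxiliary regression of CHits on all other predictors
1 / (1 - summary(aux)$r.squared) # hand-computed VIF for CHits; should be close to the vif() output above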
# All possible regressions
all <- regsubsets(Salary ~ ., data = hitters, nvmax = 19)
allSumm <- summary(all)
# Find the optimal number of explanatory variables
max_idx <- which.max(allSumm$adjr2)
print(paste("Optimal number of explanatory variables:", max_idx))
## [1] "Optimal number of explanatory variables: 11"
# Plot the Adjusted R^2 for each of these
{plot(allSumm$adjr2, type = "l", xlab = "Number of Explanatory Variables",
ylab = bquote("Adjusted R"^2), main = "Optimal Model Order")
points(allSumm$adjr2, pch = 16)
points(x = max_idx, y = allSumm$adjr2[max_idx], pch = 16, col = "red")}
# Find the optimum combination of 11 variables to include
allSumm$which[max_idx,]
## (Intercept) AtBat Hits HmRun Runs RBI
## TRUE TRUE TRUE FALSE FALSE FALSE
## Walks Years CAtBat CHits CHmRun CRuns
## TRUE FALSE TRUE FALSE FALSE TRUE
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## TRUE TRUE TRUE TRUE TRUE TRUE
## Errors NewLeagueN
## FALSE FALSE
modelAllOptimum <- lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists,
data = hitters)
sm <- lm(Salary ~ 1, data = hitters)
lg <- lm(Salary ~ ., data = hitters)
# Forward selection
# forwardSteps <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
# direction = "forward")
forward <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
direction = "forward", trace = 0)
# Backward selection
# backwardSteps <- stepAIC(object = lg, scope = list(upper = lg, lower = sm),
# direction = "backward")
backward <- stepAIC(object = lg, scope = list(upper = lg, lower = sm),
direction = "backward", trace = 0)
# Hybrid selection
# hybridSteps <- stepAIC(object = lg, scope = list(upper = lg, lower = sm),
# direction = "both")
hybrid <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
direction = "both", trace = 0)
# Perform cross validation using a single 80/20 train/test split (validation-set approach)
n <- dim(hitters)[1]
trn <- sample(x = c(rep(TRUE, round(0.8*n)), rep(FALSE, n-round(0.8*n))),
size = n, replace = FALSE)
train <- hitters[trn,]
tst <- !trn
test <- hitters[tst,]
# Function that returns the test RMSE
# Takes a fitted model and a test data frame as input
getRMSE <- function(mod, test) {
pred <- predict(object = mod, newdata = test) # predictions on the held-out data
RMSE <- sqrt(mean((test$Salary - pred)^2))
return (RMSE)
}
# Full model
mod <- lm(Salary ~ ., data = train)
RMSEfull <- getRMSE(mod, test)
# Optimal all-possible regressions model
mod <- lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists,
data = train)
RMSEoptimum <- getRMSE(mod, test)
# Best stepwise-selection model
sm <- lm(Salary ~ 1, data = train)
lg <- lm(Salary ~ ., data = train)
forward <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
direction = "forward", trace = 0)
RMSEforward <- getRMSE(forward, test)
summary(forward)
##
## Call:
## lm(formula = Salary ~ CRBI + Hits + PutOuts + Walks + AtBat +
## Division, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -899.01 -184.52 -15.05 161.11 1102.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.98218 74.01052 0.946 0.345492
## CRBI 0.61066 0.06810 8.967 < 2e-16 ***
## Hits 7.14081 1.70682 4.184 4.27e-05 ***
## PutOuts 0.26108 0.07594 3.438 0.000710 ***
## Walks 4.88772 1.30202 3.754 0.000227 ***
## AtBat -1.81945 0.55614 -3.272 0.001257 **
## DivisionW -105.10370 42.84764 -2.453 0.015012 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 305 on 203 degrees of freedom
## Multiple R-squared: 0.5238, Adjusted R-squared: 0.5098
## F-statistic: 37.22 on 6 and 203 DF, p-value: < 2.2e-16
backward <- stepAIC(object = lg, scope = list(upper = lg, lower = sm),
direction = "backward", trace = 0)
RMSEbackward <- getRMSE(backward, test)
summary(backward)
##
## Call:
## lm(formula = Salary ~ AtBat + Hits + Walks + CRuns + CRBI + CWalks +
## Division + PutOuts, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -809.9 -170.7 -11.2 153.7 1110.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.6051 74.1619 1.249 0.213232
## AtBat -1.9475 0.5519 -3.529 0.000518 ***
## Hits 6.3521 1.7037 3.728 0.000250 ***
## Walks 7.6122 1.6696 4.559 8.91e-06 ***
## CRuns 0.6670 0.2609 2.557 0.011297 *
## CRBI 0.5137 0.1930 2.661 0.008416 **
## CWalks -0.7969 0.2925 -2.724 0.007016 **
## DivisionW -110.3624 42.4453 -2.600 0.010011 *
## PutOuts 0.2673 0.0757 3.531 0.000513 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 300.4 on 201 degrees of freedom
## Multiple R-squared: 0.5426, Adjusted R-squared: 0.5244
## F-statistic: 29.81 on 8 and 201 DF, p-value: < 2.2e-16
hybrid <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
direction = "both", trace = 0)
RMSEhybrid <- getRMSE(hybrid, test)
summary(hybrid)
##
## Call:
## lm(formula = Salary ~ CRBI + Hits + PutOuts + Walks + AtBat +
## Division, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -899.01 -184.52 -15.05 161.11 1102.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.98218 74.01052 0.946 0.345492
## CRBI 0.61066 0.06810 8.967 < 2e-16 ***
## Hits 7.14081 1.70682 4.184 4.27e-05 ***
## PutOuts 0.26108 0.07594 3.438 0.000710 ***
## Walks 4.88772 1.30202 3.754 0.000227 ***
## AtBat -1.81945 0.55614 -3.272 0.001257 **
## DivisionW -105.10370 42.84764 -2.453 0.015012 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 305 on 203 degrees of freedom
## Multiple R-squared: 0.5238, Adjusted R-squared: 0.5098
## F-statistic: 37.22 on 6 and 203 DF, p-value: < 2.2e-16
print(paste("Full model CV RMSE:", round(RMSEfull, 3)))
## [1] "Full model CV RMSE: 373.739"
print(paste("Optimal model CV RMSE:", round(RMSEoptimum, 3)))
## [1] "Optimal model CV RMSE: 362.098"
print(paste("Forward selction CV RMSE:", round(RMSEforward, 3)))
## [1] "Forward selction CV RMSE: 375.272"
print(paste("Backward selection CV RMSE:", round(RMSEbackward, 3)))
## [1] "Backward selection CV RMSE: 368.284"
print(paste("Hybrid selection CV RMSE:", round(RMSEhybrid, 3)))
## [1] "Hybrid selection CV RMSE: 375.272"
The optimal all-possible-regressions model appears to be the most appropriate, since it has the lowest cross-validation RMSE among all of the models fitted.
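For convenience, the RMSEs from this 80/20 split can be gathered into one table for comparison (a small sketch added here; rmse_tab is a new helper object, not part of the original analysis).
rmse_tab <- data.frame(model = c("Full", "Optimal subset", "Forward", "Backward", "Hybrid"),
RMSE = c(RMSEfull, RMSEoptimum, RMSEforward, RMSEbackward, RMSEhybrid))
rmse_tab[order(rmse_tab$RMSE), ] # models ordered from lowest to highest RMSE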
# Full model
mod <- glm(Salary ~ ., data = hitters)
set.seed(1)
RMSEfull_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])
# Optimal all-possible regressions model
mod <- glm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI +
CWalks + League + Division + PutOuts + Assists,
data = hitters)
set.seed(1)
RMSEoptimum_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])
# Best stepwise-selection model
sm <- lm(Salary ~ 1, data = hitters)
lg <- lm(Salary ~ ., data = hitters)
forward <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
direction = "forward", trace = 0)
mod <- glm(formula(forward), data = hitters) # refit the selected model as a glm so cv.glm() can be used
set.seed(1)
RMSEforward_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])
backward <- stepAIC(object = lg, scope = list(upper = lg, lower = sm),
direction = "backward", trace = 0)
mod <- glm(formula(backward), data = hitters)
set.seed(1)
RMSEbackward_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])
hybrid <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
direction = "both", trace = 0)
mod <- glm(formula(hybrid), data = hitters)
set.seed(1)
RMSEhybrid_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])
print(paste("Full model K-fold RMSE:", round(RMSEfull_K, 3)))
## [1] "Full model K-fold RMSE: 346.322"
print(paste("Optimal model K-fold RMSE:", round(RMSEoptimum_K, 3)))
## [1] "Optimal model K-fold RMSE: 332.789"
print(paste("Forward selection K-fold RMSE:", round(RMSEforward_K, 3)))
## [1] "Forward selection K-fold RMSE: 331.743"
print(paste("Backward selection K-fold RMSE:", round(RMSEbackward_K, 3)))
## [1] "Backward selection K-fold RMSE: 331.743"
print(paste("Hybrid selection K-fold RMSE:", round(RMSEhybrid_K, 3)))
## [1] "Hybrid selection K-fold RMSE: 331.743"
Overall, hybrid stepwise selection seems to be the method of choice. In this case the three stepwise-selection methods all appear to select the same set of variables, but this may not always happen when multicollinearity is present. If multicollinearity were present among the variables, the hybrid approach could yield a considerably better model than forward selection alone, since it allows variables to leave the model as well as enter it. Hybrid stepwise selection also gives good predictive accuracy under k-fold cross validation: its RMSE matches that of the other stepwise methods and is slightly lower than that of the optimal all-possible-regressions model. Time-wise, hybrid stepwise selection outperforms the all-possible-regressions search, since it requires far fewer model fits, and here it produces a model whose predictive accuracy is comparable to, if not slightly better than, that of the optimal model.
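The step-by-step path taken by the hybrid search (which variables were added or dropped, and the AIC after each step) is recorded in the anova component of the stepAIC result and can be inspected directly.
hybrid$anova # sequence of '+' (added) and '-' (dropped) terms, with the AIC at each step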
## Shrinkage/Regularization Methods
## Ridge and LASSO
X <- model.matrix(Salary ~ ., hitters)[,-1]
y <- hitters$Salary
grid <- 10^seq(10, -3, length.out = 1000) # Grid of candidate lambda values, from 10^10 down to 10^-3
# RIDGE
ridge.mod <- glmnet(X, y, alpha=0, lambda=grid)
plot(ridge.mod, xvar = "lambda", label = TRUE) # Plot param ests against lambda
## LASSO
lasso.mod <- glmnet(X, y, alpha=1, lambda=grid)
plot(lasso.mod, xvar = "lambda", label = TRUE) # Plot param ests against lambda
set.seed(1)
train <- sample(1:nrow(X), 210)
test <- (-train)
y.test <- y[test]
# RIDGE
cv.out <- cv.glmnet(X[train,], y[train], alpha=0) # Find best lambda
plot(cv.out)
bestlam.r <- cv.out$lambda.min
print(paste("Optimal lambda:", round(bestlam.r,3)))
## [1] "Optimal lambda: 36.576"
predict(ridge.mod, s=bestlam.r, type = "coefficients")[1:20,] # Best ridge regression model
## (Intercept) AtBat Hits HmRun Runs
## 6.241428e+01 -4.930832e-01 2.296882e+00 -1.385474e+00 1.110477e+00
## RBI Walks Years CAtBat CHits
## 7.755174e-01 2.999884e+00 -7.563493e+00 3.691778e-03 1.198356e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.621278e-01 2.470793e-01 2.338690e-01 -2.058511e-01 4.961218e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.208793e+02 2.572746e-01 1.425053e-01 -3.501714e+00 -1.372009e+01
# LASSO
cv.out <- cv.glmnet(X[train,], y[train], alpha=1) # Find best lambda
plot(cv.out)
bestlam.l <- cv.out$lambda.min
print(paste("Optimal lambda:", round(bestlam.l,3)))
## [1] "Optimal lambda: 2.641"
predict(lasso.mod, s=bestlam.l, type = "coefficients")[1:20,] # Best LASSO regression model
## (Intercept) AtBat Hits HmRun Runs
## 124.1430078 -1.5557245 5.6830743 0.0000000 0.0000000
## RBI Walks Years CAtBat CHits
## 0.0000000 4.7375354 -9.5924702 0.0000000 0.0000000
## CHmRun CRuns CRBI CWalks LeagueN
## 0.5229904 0.6637874 0.3875519 -0.5318693 32.2219192
## DivisionW PutOuts Assists Errors NewLeagueN
## -119.2561455 0.2727829 0.1747881 -2.0717182 0.0000000
In the ridge model, all of the coefficient estimates have been shrunk toward zero: the less important variables have coefficients close to, but not exactly, zero, while the more important variables retain small coefficients. In contrast, the LASSO sets the coefficients of the less important variables exactly to zero, and the coefficients it retains are larger than the corresponding ridge estimates. The two models broadly agree on which variables are important and which are not.
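A side-by-side comparison makes this shrinkage pattern explicit (an illustrative sketch; ridge_coef and lasso_coef are new helper objects introduced here).
ridge_coef <- as.matrix(predict(ridge.mod, s = bestlam.r, type = "coefficients")) # ridge estimates at the optimal lambda
lasso_coef <- as.matrix(predict(lasso.mod, s = bestlam.l, type = "coefficients")) # LASSO estimates at the optimal lambda
round(cbind(ridge = ridge_coef[, 1], lasso = lasso_coef[, 1]), 3) # coefficient estimates side by side
sum(lasso_coef[, 1] == 0) # number of coefficients the LASSO sets exactly to zero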
trn <- hitters[train,]
tst <- hitters[-train,] # Held-out test set
stepwise <- lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns +
CRBI + CWalks + Division + PutOuts + Assists, data = trn)
step.pred <- predict(stepwise, tst)
RMSEs <- sqrt(mean((step.pred - y.test)^2)) # Predictive RMSE of best stepwise model
ridge.pred <- predict(ridge.mod, s=bestlam.r, newx=X[test,])
RMSEr <- sqrt(mean((ridge.pred - y.test)^2)) # Predictive RMSE of best ridge model
lasso.pred <- predict(lasso.mod, s=bestlam.l, newx=X[test,])
RMSEl <- sqrt(mean((lasso.pred - y.test)^2)) # Predictive RMSE of best LASSO model
print(paste("RMSE (Stepwise):", round(RMSEs,3)))
## [1] "RMSE (Stepwise): 245.978"
print(paste("RMSE (LASSO):", round(RMSEl,3)))
## [1] "RMSE (LASSO): 232.878"
print(paste("RMSE (Ridge):", round(RMSEr,3)))
## [1] "RMSE (Ridge): 245.276"
The test RMSE of the best LASSO model is the lowest, followed by that of the best ridge model and then the best stepwise-selection model. Based on this criterion, the LASSO model is the best.