# Read in data and load packages
library(ISLR)
library(leaps)
library(MASS)
library(boot)
library(car)
library(glmnet)
hitters <- read.csv("hitters.csv", header = TRUE)
# Find VIF 
mod <- lm(Salary ~ ., data = hitters)
modSumm <- summary(mod)
vif(mod)
##      AtBat       Hits      HmRun       Runs        RBI      Walks 
##  22.944366  30.281255   7.758668  15.246418  11.921715   4.148712 
##      Years     CAtBat      CHits     CHmRun      CRuns       CRBI 
##   9.313280 251.561160 502.954289  46.488462 162.520810 131.965858 
##     CWalks     League   Division    PutOuts    Assists     Errors 
##  19.744105   4.134115   1.075398   1.236317   2.709341   2.214543 
##  NewLeague 
##   4.099063

Many of the VIF scores are well above 10, indicating that multicollinearity is an issue. The three most affected variables are CHits, CAtBat, and CRuns, in that order.
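The same ranking can be confirmed by sorting the VIF output directly; a minimal sketch reusing the mod object fitted above (the vifs name is only for illustration):

# Rank the predictors by VIF to identify the worst offenders
vifs <- sort(vif(mod), decreasing = TRUE)
head(vifs, 3)  # CHits, CAtBat and CRuns, all far above the common cutoff of 10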

# All possible regressions
all <- regsubsets(Salary ~ ., data = hitters, nvmax = 19)
allSumm <- summary(all)

# Find the optimal number of explanatory variables
max_idx <- which.max(allSumm$adjr2)
print(paste("Optimal number of explanatory variables:", max_idx))
## [1] "Optimal number of explanatory variables: 11"
# Plot the adjusted R^2 for each model size
{plot(allSumm$adjr2, type = "l", xlab = "Number of Explanatory Variables", 
      ylab = bquote("Adjusted R"^2), main = "Optimal Model Order")
points(allSumm$adjr2, pch = 16)
points(x = max_idx, y = allSumm$adjr2[max_idx], pch = 16, col = "red")}

# Find the optimum combination of 11 variables to include
allSumm$which[max_idx,]
## (Intercept)       AtBat        Hits       HmRun        Runs         RBI 
##        TRUE        TRUE        TRUE       FALSE       FALSE       FALSE 
##       Walks       Years      CAtBat       CHits      CHmRun       CRuns 
##        TRUE       FALSE        TRUE       FALSE       FALSE        TRUE 
##        CRBI      CWalks     LeagueN   DivisionW     PutOuts     Assists 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE 
##      Errors  NewLeagueN 
##       FALSE       FALSE
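# As a small optional sketch, the selected predictors can also be listed
# programmatically. Note that regsubsets reports them in dummy-coded form
# (e.g. LeagueN, DivisionW), which is why the model below is written out
# against the original factor columns.
selected <- names(which(allSumm$which[max_idx, ]))[-1]  # drop the intercept
selected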
modelAllOptimum <- lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + 
                        CWalks + League + Division + PutOuts + Assists, 
                      data = hitters)

sm <- lm(Salary ~ 1, data = hitters)
lg <- lm(Salary ~ ., data = hitters)

# Forward selection
# forwardSteps <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
#                         direction = "forward")
forward <- stepAIC(object = sm, scope = list(upper = lg, lower = sm), 
                   direction = "forward", trace = 0)

# Backward selection
# backwardSteps <- stepAIC(object = lg, scope = list(upper = lg, lower = sm),
#                         direction = "backward")
backward <- stepAIC(object = lg, scope = list(upper = lg, lower = sm), 
                    direction = "backward", trace = 0)

# Hybrid selection
# hybridSteps <- stepAIC(object = sm, scope = list(upper = lg, lower = sm),
#                        direction = "both")
hybrid <- stepAIC(object = sm, scope = list(upper = lg, lower = sm), 
                  direction = "both", trace = 0)

# Validation via a random 80/20 train/test split (no seed set, so results vary between runs)
n <- dim(hitters)[1]
trn <- sample(x = c(rep(TRUE, round(0.8*n)), rep(FALSE, n-round(0.8*n))), 
              size = n, replace = FALSE)
train <- hitters[trn,]
tst <- !trn 
test <- hitters[tst,]

# Function that returns the test RMSE
# Takes a fitted model and a test data frame as input
getRMSE <- function(mod, test) {
  pred <- predict(object = mod, newdata = test)
  RMSE <- sqrt(mean((test$Salary - pred)^2))
  return(RMSE)
}

# Full model 
mod <- lm(Salary ~ ., data = train)
RMSEfull <- getRMSE(mod, test)

# Optimal all-possible regressions model
mod <- lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + 
                        CWalks + League + Division + PutOuts + Assists, 
                      data = train)
RMSEoptimum <- getRMSE(mod, test)

# Best stepwise-selection model
sm <- lm(Salary ~ 1, data = train)
lg <- lm(Salary ~ ., data = train)

forward <- stepAIC(object = sm, scope = list(upper = lg, lower = sm), 
                   direction = "forward", trace = 0)
RMSEforward <- getRMSE(forward, test)
summary(forward)
## 
## Call:
## lm(formula = Salary ~ CRBI + Hits + PutOuts + Walks + AtBat + 
##     Division, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -899.01 -184.52  -15.05  161.11 1102.63 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   69.98218   74.01052   0.946 0.345492    
## CRBI           0.61066    0.06810   8.967  < 2e-16 ***
## Hits           7.14081    1.70682   4.184 4.27e-05 ***
## PutOuts        0.26108    0.07594   3.438 0.000710 ***
## Walks          4.88772    1.30202   3.754 0.000227 ***
## AtBat         -1.81945    0.55614  -3.272 0.001257 ** 
## DivisionW   -105.10370   42.84764  -2.453 0.015012 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 305 on 203 degrees of freedom
## Multiple R-squared:  0.5238, Adjusted R-squared:  0.5098 
## F-statistic: 37.22 on 6 and 203 DF,  p-value: < 2.2e-16
backward <- stepAIC(object = lg, scope = list(upper = lg, lower = sm), 
                    direction = "backward", trace = 0)
RMSEbackward <- getRMSE(backward, test)
summary(backward)
## 
## Call:
## lm(formula = Salary ~ AtBat + Hits + Walks + CRuns + CRBI + CWalks + 
##     Division + PutOuts, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -809.9 -170.7  -11.2  153.7 1110.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   92.6051    74.1619   1.249 0.213232    
## AtBat         -1.9475     0.5519  -3.529 0.000518 ***
## Hits           6.3521     1.7037   3.728 0.000250 ***
## Walks          7.6122     1.6696   4.559 8.91e-06 ***
## CRuns          0.6670     0.2609   2.557 0.011297 *  
## CRBI           0.5137     0.1930   2.661 0.008416 ** 
## CWalks        -0.7969     0.2925  -2.724 0.007016 ** 
## DivisionW   -110.3624    42.4453  -2.600 0.010011 *  
## PutOuts        0.2673     0.0757   3.531 0.000513 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 300.4 on 201 degrees of freedom
## Multiple R-squared:  0.5426, Adjusted R-squared:  0.5244 
## F-statistic: 29.81 on 8 and 201 DF,  p-value: < 2.2e-16
hybrid <- stepAIC(object = sm, scope = list(upper = lg, lower = sm), 
                  direction = "both", trace = 0)
RMSEhybrid <- getRMSE(hybrid, test)
summary(hybrid)
## 
## Call:
## lm(formula = Salary ~ CRBI + Hits + PutOuts + Walks + AtBat + 
##     Division, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -899.01 -184.52  -15.05  161.11 1102.63 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   69.98218   74.01052   0.946 0.345492    
## CRBI           0.61066    0.06810   8.967  < 2e-16 ***
## Hits           7.14081    1.70682   4.184 4.27e-05 ***
## PutOuts        0.26108    0.07594   3.438 0.000710 ***
## Walks          4.88772    1.30202   3.754 0.000227 ***
## AtBat         -1.81945    0.55614  -3.272 0.001257 ** 
## DivisionW   -105.10370   42.84764  -2.453 0.015012 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 305 on 203 degrees of freedom
## Multiple R-squared:  0.5238, Adjusted R-squared:  0.5098 
## F-statistic: 37.22 on 6 and 203 DF,  p-value: < 2.2e-16
print(paste("Full model CV RMSE:", round(RMSEfull, 3)))
## [1] "Full model CV RMSE: 373.739"
print(paste("Optimal model CV RMSE:", round(RMSEoptimum, 3)))
## [1] "Optimal model CV RMSE: 362.098"
print(paste("Forward selction CV RMSE:", round(RMSEforward, 3)))
## [1] "Forward selction CV RMSE: 375.272"
print(paste("Backward selection CV RMSE:", round(RMSEbackward, 3)))
## [1] "Backward selection CV RMSE: 368.284"
print(paste("Hybrid selection CV RMSE:", round(RMSEhybrid, 3)))
## [1] "Hybrid selection CV RMSE: 375.272"

The optimal all-possible-regressions model appears to be the most appropriate, since it has the lowest hold-out RMSE of all the models fitted.
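Collecting the five hold-out RMSEs into a single named vector makes this comparison explicit; a small sketch using the objects computed above (the rmse name is only for illustration):

# Gather the hold-out RMSEs and rank the candidate models
rmse <- c(Full = RMSEfull, Optimal = RMSEoptimum, Forward = RMSEforward,
          Backward = RMSEbackward, Hybrid = RMSEhybrid)
sort(rmse)  # the optimal all-possible-regressions model comes out lowest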

# Full model 
mod <- glm(Salary ~ ., data = hitters)
set.seed(1)
RMSEfull_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])

# Optimal all-possible regressions model
mod <- glm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + CRBI + 
                        CWalks + League + Division + PutOuts + Assists, 
                      data = hitters)
set.seed(1)
RMSEoptimum_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])

# Best stepwise-selection model
sm <- lm(Salary ~ 1, data = hitters)
lg <- lm(Salary ~ ., data = hitters)

forward <- stepAIC(object = sm, scope = list(upper = lg, lower = sm), 
                   direction = "forward", trace = 0)
mod <- glm(formula(forward), data = hitters)
set.seed(1)
RMSEforward_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])

backward <- stepAIC(object = lg, scope = list(upper = lg, lower = sm), 
                    direction = "backward", trace = 0)
mod <- glm(formula(backward), data = hitters)
set.seed(1)
RMSEbackward_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])

hybrid <- stepAIC(object = sm, scope = list(upper = lg, lower = sm), 
                  direction = "both", trace = 0)
mod <- glm(formula(hybrid), data = hitters)
set.seed(1)
RMSEhybrid_K <- sqrt((cv.glm(hitters, mod, K = 10)$delta)[1])

print(paste("Full model K-fold RMSE:", round(RMSEfull_K, 3)))
## [1] "Full model K-fold RMSE: 346.322"
print(paste("Optimal model K-fold RMSE:", round(RMSEoptimum_K, 3)))
## [1] "Optimal model K-fold RMSE: 332.789"
print(paste("Forward selection K-fold RMSE:", round(RMSEforward_K, 3)))
## [1] "Forward selection K-fold RMSE: 331.743"
print(paste("Backward selection K-fold RMSE:", round(RMSEbackward_K, 3)))
## [1] "Backward selection K-fold RMSE: 331.743"
print(paste("Hybrid selection K-fold RMSE:", round(RMSEhybrid_K, 3)))
## [1] "Hybrid selection K-fold RMSE: 331.743"

Overall, hybrid stepwise selection seems to be the method of choice. In this data set the three stepwise methods all arrive at the same set of variables, but that need not happen when multicollinearity is present; in that case the hybrid approach tends to produce a better model than pure forward selection, since it allows variables that have become redundant to exit the model again. Hybrid selection also predicts well under 10-fold cross-validation: its RMSE (331.743, identical to forward and backward selection here because all three chose the same variables) is marginally below that of the optimal all-possible-regressions model (332.789) and well below that of the full model (346.322). Computationally, hybrid stepwise selection also outperforms the all-possible-regressions search, since it fits far fewer candidate models (see the sketch below) while delivering comparable, if not better, predictive accuracy.
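As a rough illustration of that computational gap, assuming p = 19 candidate predictors: an exhaustive all-possible-regressions search has 2^p candidate subsets (branch-and-bound pruning in leaps avoids fitting every one, but the search space still grows exponentially), whereas a single forward pass evaluates at most on the order of p(p + 1)/2 candidate fits.

# Size of the search space for each approach (counts, not timings)
p <- 19
2^p              # candidate subsets for an exhaustive all-subsets search
p * (p + 1) / 2  # rough upper bound on the model fits in one forward pass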

## Shrinkage/Regularization Methods
## Ridge and LASSO
X <- model.matrix(Salary ~ ., hitters)[,-1]
y <- hitters$Salary

grid <- 10^seq(10, -3, length.out = 1000)  # Grid of lambda values

# RIDGE
ridge.mod <- glmnet(X, y, alpha=0, lambda=grid)
plot(ridge.mod, xvar = "lambda", label = TRUE)  # Plot param ests against lambda

## LASSO
lasso.mod <- glmnet(X, y, alpha=1, lambda=grid)
plot(lasso.mod, xvar = "lambda", label = TRUE)  # Plot param ests against lambda

set.seed(1)  
train <- sample(1:nrow(X), 210)
test <- (-train)
y.test <- y[test]

# RIDGE
cv.out <- cv.glmnet(X[train,], y[train], alpha=0)  # Find best lambda
plot(cv.out)

bestlam.r <- cv.out$lambda.min
print(paste("Optimal lambda:", round(bestlam.r,3)))
## [1] "Optimal lambda: 36.576"
predict(ridge.mod, s=bestlam.r, type = "coefficients")[1:20,]  # Ridge coefficients at the optimal lambda
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##  6.241428e+01 -4.930832e-01  2.296882e+00 -1.385474e+00  1.110477e+00 
##           RBI         Walks         Years        CAtBat         CHits 
##  7.755174e-01  2.999884e+00 -7.563493e+00  3.691778e-03  1.198356e-01 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##  6.621278e-01  2.470793e-01  2.338690e-01 -2.058511e-01  4.961218e+01 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -1.208793e+02  2.572746e-01  1.425053e-01 -3.501714e+00 -1.372009e+01
# LASSO
cv.out <- cv.glmnet(X[train,], y[train], alpha=1)  # Find best lambda
plot(cv.out)

bestlam.l <- cv.out$lambda.min
print(paste("Optimal lambda:", round(bestlam.l,3)))
## [1] "Optimal lambda: 2.641"
predict(lasso.mod, s=bestlam.l, type = "coefficients")[1:20,]  # LASSO coefficients at the optimal lambda
##  (Intercept)        AtBat         Hits        HmRun         Runs 
##  124.1430078   -1.5557245    5.6830743    0.0000000    0.0000000 
##          RBI        Walks        Years       CAtBat        CHits 
##    0.0000000    4.7375354   -9.5924702    0.0000000    0.0000000 
##       CHmRun        CRuns         CRBI       CWalks      LeagueN 
##    0.5229904    0.6637874    0.3875519   -0.5318693   32.2219192 
##    DivisionW      PutOuts      Assists       Errors   NewLeagueN 
## -119.2561455    0.2727829    0.1747881   -2.0717182    0.0000000

In the ridge model, every coefficient has been shrunk towards zero: the less important variables have estimates close to, but not exactly, zero, and even the important ones are small. In the LASSO model, by contrast, the unimportant variables are set exactly to zero, and the coefficients that remain tend to be larger in magnitude than the corresponding ridge estimates. The two models largely agree on which variables matter and which do not.
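One way to make that contrast concrete is to count how many coefficients each fit sets to (numerically) zero at its chosen lambda; a short sketch using the fitted objects above:

# Count coefficients shrunk to zero at the selected lambdas
# (abs() < 1e-8 treats numerically tiny values as zero, since predict()
#  interpolates over the lambda grid)
ridge.coef <- predict(ridge.mod, s = bestlam.r, type = "coefficients")[-1, ]
lasso.coef <- predict(lasso.mod, s = bestlam.l, type = "coefficients")[-1, ]
sum(abs(ridge.coef) < 1e-8)  # ridge: no coefficients are exactly zero
sum(abs(lasso.coef) < 1e-8)  # LASSO: several predictors are dropped entirely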

trn <- hitters[train,]
tst <- hitters[-train,] # Held-out test set

stepwise <- lm(Salary ~ AtBat + Hits + Walks + CAtBat + CRuns + 
                 CRBI + CWalks + Division + PutOuts + Assists, data = trn)
step.pred <- predict(stepwise, tst)
RMSEs <- sqrt(mean((step.pred - y.test)^2))  # Predictive RMSE of best stepwise model

ridge.pred <- predict(ridge.mod, s=bestlam.r, newx=X[test,])
RMSEr <- sqrt(mean((ridge.pred - y.test)^2))  # Predictive RMSE of best ridge model

lasso.pred <- predict(lasso.mod, s=bestlam.l, newx=X[test,])
RMSEl <- sqrt(mean((lasso.pred - y.test)^2))  # Predictive RMSE of best LASSO model

print(paste("RMSE (Stepwise):", round(RMSEs,3)))
## [1] "RMSE (Stepwise): 245.978"
print(paste("RMSE (LASSO):", round(RMSEl,3)))
## [1] "RMSE (LASSO): 232.878"
print(paste("RMSE (Ridge):", round(RMSEr,3)))
## [1] "RMSE (Ridge): 245.276"

The best LASSO model has the lowest test RMSE, followed by the best ridge model and then the best stepwise-selection model. On this criterion, the LASSO model is the best choice.
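The same ranking, together with the relative gap to the best model, can be read straight off the three test-set RMSEs computed above:

# Rank the candidate models by test-set RMSE and show the gap to the best
test.rmse <- c(Stepwise = RMSEs, Ridge = RMSEr, LASSO = RMSEl)
sort(test.rmse)
round(100 * (test.rmse - min(test.rmse)) / min(test.rmse), 1)  # percent worse than LASSO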