Predicting Points in the NBA




Predicting Points Scored

The goal of this project was to determine which team statistics were the best predictors of points scored.

First, I loaded in the NBA team data from 2009-2018 and did some cleaning.

NBA_data <- read.csv("NBA_TEAM_DATA_2009-2018 - Sheet1.csv",header = T, stringsAsFactors = F)

NBA_data <- NBA_data[-seq(31,279,31),] # drop every 31st row (the extra non-team rows in the sheet)

# convert every column except the team name to numeric
for(i in 2:length(NBA_data)){
  NBA_data[,i] <- as.numeric(NBA_data[,i])
}
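
A quick sanity check that the cleanup behaved as expected (the exact dimensions depend on the sheet, so this is just a rough check):

dim(NBA_data)        # number of team-season rows left after dropping the extra rows
str(NBA_data[, 1:8]) # columns 2 onward should now be numeric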

The next step was determining the best predictors. To do so, I built a loop that tests every combination of a given number of variables (from 1 to 5 here, though more could be tested), fits a linear model of points on each combination, and stores the combination that produces the smallest root mean squared error (RMSE).
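
One note before the code: the rmse() call used below computes the RMSE directly from a fitted model and comes from an add-on package (several packages provide a function with that name and signature). If no such package is loaded, a minimal stand-in that does the same job is:

rmse <- function(model) {
  sqrt(mean(residuals(model)^2)) # square root of the mean squared residual
}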

n <- 5 # maximum number of variables to test
predictors <- matrix(NA,nrow = n, ncol = n+1) # create empty matrix to store the best predictors for a given number of variables (from 1 to n)

for(i in 1:n){
  permutations <- combn(19,i) # all combinations of i of the 19 candidate variables
  
  permutations <- permutations + 7 # the candidate variables sit in columns 8-26 of NBA_data, so shift the indices by 7 to match the column numbers
  
  num_permutations <- ncol(permutations)
  rsme <- matrix(NA,nrow=num_permutations,ncol= i+1) # empty matrix to store the variable names and RMSE of each combination
  
  # For each combination of i variables (where i is 1:n)
  for(j in 1:num_permutations){
      variables <- rep(NA,i) # empty vector to be filled with the variables tested on this iteration
      
      for(k in 1:i){
        variables[k] <- names(NBA_data[permutations[k,j]]) # fill the vector with the variable names for this combination
        
        rsme[j,k] <- names(NBA_data[permutations[k,j]]) # also store the names in the results matrix so the RMSE can be matched back to them
      }
      
      # Create the formula for the linear model of PTS as a function of the combination of variables we've just established
      f <- as.formula(
        paste("PTS", 
              paste(variables, collapse = " + "), 
              sep = " ~ "))
    lm_points <- lm(formula = f,data = NBA_data) # fit the linear model
    rsme[j,i+1] <- rmse(lm_points) # record its RMSE
  }
  for(k in 1:i){
    predictors[i,k] <- rsme[which.min(rsme[,i+1]),k] # record the variable names of the model with the lowest RMSE
  }
  predictors[i,n+1] <- rsme[which.min(rsme[,i+1]),i+1] # record that model's RMSE as well
}
predictors
##      [,1]  [,2]       [,3]       [,4]       [,5]  [,6]                
## [1,] "FGM" NA         NA         NA         NA    "2.42463525665235"  
## [2,] "FGM" "FTM"      NA         NA         NA    "1.75788594659143"  
## [3,] "FGM" "X3.00.PM" "FTM"      NA         NA    "0.0782929027447956"
## [4,] "FGM" "FGA"      "X3.00.PM" "FTM"      NA    "0.0770910317436641"
## [5,] "FGM" "FGA"      "FG."      "X3.00.PM" "FTM" "0.072392019693894"

Next, I tested these linear models. To do this, I fit each model on the data from every season except 2017, used it to predict points scored by teams in 2017, and then examined the error of that prediction.

NBA_data_2017 <- NBA_data[which(NBA_data$Year == 2017),] # data frame with only the 2017 season
NBA_data_other <- NBA_data[which(NBA_data$Year != 2017),] # data frame with every other season

# For each number of predictors (from 1 to 5)
for(i in 1:n){
  # Create a linear model using the variables that had the smallest RMSE, which are stored in the predictors matrix
  f <- as.formula(
    paste("PTS", 
          paste(predictors[i,1:i], collapse = " + "), 
          sep = " ~ "))
  lm_predict <- lm(formula = f, data = NBA_data_other) # create linear model for prediction
  string <- print(paste("PTS = ",round(lm_predict$coefficients[1],4)," + ",paste(round(lm_predict$coefficients[1:i+1],4),predictors[i,1:i], sep = "*",collapse = " + "),sep="")) # print linear model as an equation above the plot we are going to create
  predicted_points <- as.numeric(predict(lm_predict, newdata = NBA_data_2017)) # use linear model to predict points for 2017
  plot(predicted_points - NBA_data_2017$PTS,ylab = "Error of Prediction",ylim = c(-5,5)) # plot predicted points - actual points
  abline(h = 0, col = "red", lty = 2, lwd = 2) # draw a dotted line at 0
}  
## [1] "PTS = 3.9584 + 2.5668*FGM"

## [1] "PTS = -9.4133 + 2.4852*FGM + 0.9225*FTM"

## [1] "PTS = 0.0533 + 1.9999*FGM + 1.0004*X3.00.PM + 0.9972*FTM"

## [1] "PTS = -0.2856 + 1.9927*FGM + 0.0071*FGA + 0.9989*X3.00.PM + 0.9991*FTM"

## [1] "PTS = -18.006 + 1.5216*FGM + 0.2221*FGA + 0.3879*FG. + 0.9996*X3.00.PM + 0.9995*FTM"

From these plots, we can see that points scored is best represented as a function of field goals made, 3-pointers made, and free throws made. This is somewhat obvious and unhelpful, since these are the three ways to score points: because FGM already counts each made 3-pointer once, PTS = 2*FGM + 3PM + FTM exactly, which is why the three-variable model recovers coefficients of roughly 2, 1, and 1 with a near-zero intercept and why the error is so small.
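
That identity is easy to check against the data itself (column names taken from the model output above; if the stats are rounded per-game averages, the gap will be small but not exactly zero):

identity_gap <- NBA_data$PTS - (2*NBA_data$FGM + NBA_data$X3.00.PM + NBA_data$FTM)
summary(identity_gap) # gap between actual points and the scoring identity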

Using this result, I then wanted to see whether any other variables were good predictors of points. I repeated the combination-testing analysis, this time with field goals made, 3-pointers made, and free throws made removed from the NBA data.

NBA_data_2 <- NBA_data[,-seq(8,14,3)] # drop FGM, 3PM, and FTM (columns 8, 11, and 14) and store the result in a new data frame
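
A quick check that the intended columns were dropped; the 16 remaining candidate predictors (columns 8 through 23 of the new frame) should no longer include FGM, 3PM, or FTM:

names(NBA_data_2)[8:23] # the candidate predictors used in the search below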


# The following code is identical to the code above, except that we are now testing combinations of the 16 remaining variables, which no longer include FGM, 3PM, or FTM.
n_2 <- 5
predictors_2 <- matrix(NA,nrow = n_2, ncol = n_2+1)
for(i in 1:n_2){
  permutations_2 <- combn(16,i)
  permutations_2 <- permutations_2 + 7
  num_permutations_2 <- ncol(permutations_2)
  rsme_2 <- matrix(NA,nrow=num_permutations_2,ncol= i+1)
  for(j in 1:num_permutations_2){
    variables_2 <- rep(NA,i)
    for(k in 1:i){
      variables_2[k] <- names(NBA_data_2[permutations_2[k,j]])
      rsme_2[j,k] <- names(NBA_data_2[permutations_2[k,j]])
    }
    f <- as.formula(
      paste("PTS", 
            paste(variables_2, collapse = " + "), 
            sep = " ~ "))
    lm_points_2 <- lm(formula = f,data = NBA_data_2)
    rsme_2[j,i+1] <- rmse(lm_points_2)
  }
  for(k in 1:i){
    predictors_2[i,k] <- rsme_2[which.min(rsme_2[,i+1]),k]
  }
  predictors_2[i,n_2+1] <- rsme_2[which.min(rsme_2[,i+1]),i+1]
}
predictors_2
##      [,1]   [,2]  [,3]   [,4]  [,5]  [,6]               
## [1,] "X3PA" NA    NA     NA    NA    "3.80437519624434" 
## [2,] "FGA"  "FG." NA     NA    NA    "2.4267700785443"  
## [3,] "FGA"  "FG." "X3PA" NA    NA    "1.8094973543401"  
## [4,] "FGA"  "FG." "X3PA" "FTA" NA    "0.817917068667567"
## [5,] "FGA"  "FG." "X3PA" "FTA" "FT." "0.351140244926771"

Again, I tested the linear models that minimized RMSE by comparing their predictions for 2017 to the actual values.

# Same code again, just adapted to the new set of predictors
NBA_data_2017_2 <- NBA_data_2[which(NBA_data_2$Year == 2017),]
NBA_data_other_2 <- NBA_data_2[which(NBA_data_2$Year != 2017),]

for(i in 1:n_2){
  f <- as.formula(
    paste("PTS", 
          paste(predictors_2[i,1:i], collapse = " + "), 
          sep = " ~ "))
  lm_predict_2 <- lm(formula = f, data = NBA_data_other_2)
  predicted_points_2 <- as.numeric(predict(lm_predict_2, newdata = NBA_data_2017_2))
  eqn <- paste("PTS = ", round(lm_predict_2$coefficients[1], 4), " + ",
               paste(round(lm_predict_2$coefficients[2:(i+1)], 4), predictors_2[i,1:i],
                     sep = "*", collapse = " + "), sep = "")
  print(eqn)
  plot(predicted_points_2 - NBA_data_2017_2$PTS,ylab = "Error of Prediction",ylim = c(-10,10))
  abline(h = 0, col = "red", lty = 2, lwd = 2)
}  
## [1] "PTS = 88.0327 + 0.5932*X3PA"

## [1] "PTS = -92.2269 + 1.1327*FGA + 2.1771*FG."

## [1] "PTS = -69.9211 + 0.8337*FGA + 2.058*FG. + 0.3752*X3PA"

## [1] "PTS = -84.9749 + 0.9264*FGA + 1.8702*FG. + 0.3633*X3PA + 0.6895*FTA"

## [1] "PTS = -99.9078 + 0.8946*FGA + 1.7927*FG. + 0.3781*X3PA + 0.7262*FTA + 0.2622*FT."

These results are also pretty straightforward: the best predictors turn out to be a combination of field goal attempts, field goal percentage, 3-point attempts, free throw attempts, and free throw percentage, which are really just made field goals, 3-pointers, and free throws in disguise (made shots are attempts times shooting percentage). Also, the errors are much larger, so these models may not be useful for any real prediction.
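
The "in disguise" point is easy to check: assuming FG. is stored on a 0-100 scale (which the fitted coefficients suggest), made field goals are just attempts times percentage:

summary(NBA_data$FGM - NBA_data$FGA * NBA_data$FG. / 100) # differences should be near zero, up to rounding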

I was also interested to see whether any non-scoring statistics (rebounds, assists, steals, and so on) were correlated with points scored. I performed the same analysis to find the combination of variables minimizing RMSE, this time excluding made shots, attempts, and shooting percentage for field goals, 3-pointers, and free throws.

NBA_data_3 <- NBA_data[,-seq(8,16,1)] # drop all nine shooting columns (columns 8-16) and store the result in a new data frame
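
As before, a quick check of the remaining candidate predictors (columns 8 through 18 of the new frame), which should now contain only non-shooting statistics:

names(NBA_data_3)[8:18] # the candidate predictors used in the search below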

# This code is the same as the searches above, except that we are now testing combinations of only 11 variables.
n_3 <- 5
predictors_3 <- matrix(NA,nrow = n_3, ncol = n_3+1)
for(i in 1:n_3){
  permutations_3 <- combn(11,i)
  permutations_3 <- permutations_3 + 7
  num_permutations_3 <- ncol(permutations_3)
  rsme_3 <- matrix(NA,nrow=num_permutations_3,ncol= i+1)
  for(j in 1:num_permutations_3){
    variables_3 <- rep(NA,i)
    for(k in 1:i){
      variables_3[k] <- names(NBA_data_3[permutations_3[k,j]])
      rsme_3[j,k] <- names(NBA_data_3[permutations_3[k,j]])
    }
    f <- as.formula(
      paste("PTS", 
            paste(variables_3, collapse = " + "), 
            sep = " ~ "))
    lm_points_3 <- lm(formula = f,data = NBA_data_3)
    rsme_3[j,i+1] <- rmse(lm_points_3)
  }
  for(k in 1:i){
    predictors_3[i,k] <- rsme_3[which.min(rsme_3[,i+1]),k]
  }
  predictors_3[i,n_3+1] <- rsme_3[which.min(rsme_3[,i+1]),i+1]
}
predictors_3
##      [,1]   [,2]  [,3]  [,4]  [,5]  [,6]              
## [1,] "DREB" NA    NA    NA    NA    "4.03160433469852"
## [2,] "DREB" "AST" NA    NA    NA    "3.6549309736084" 
## [3,] "DREB" "AST" "PFD" NA    NA    "3.26695128031111"
## [4,] "DREB" "AST" "STL" "PFD" NA    "3.17808219419073"
## [5,] "DREB" "AST" "TOV" "STL" "PFD" "3.1129141261377"

I then plotted the 2017 prediction error (predicted minus actual) for each RMSE-minimizing combination of predictors.

NBA_data_2017_3 <- NBA_data_3[which(NBA_data_3$Year == 2017),]
NBA_data_other_3 <- NBA_data_3[which(NBA_data_3$Year != 2017),]

for(i in 1:n_3){
  f <- as.formula(
    paste("PTS", 
          paste(predictors_3[i,1:i], collapse = " + "), 
          sep = " ~ "))
  lm_predict_3 <- lm(formula = f, data = NBA_data_other_3)
  predicted_points_3 <- as.numeric(predict(lm_predict_3, newdata = NBA_data_2017_3))
  eqn <- paste("PTS = ", round(lm_predict_3$coefficients[1], 4), " + ",
               paste(round(lm_predict_3$coefficients[2:(i+1)], 4), predictors_3[i,1:i],
                     sep = "*", collapse = " + "), sep = "")
  print(eqn)
  plot(predicted_points_3 - NBA_data_2017_3$PTS,ylab = "Error of Prediction",ylim=c(-10,10))
  abline(h = 0, col = "red", lty = 2, lwd = 2)
} 
## [1] "PTS = 56.3024 + 1.3963*DREB"

## [1] "PTS = 46.0816 + 1.0471*DREB + 0.9769*AST"

## [1] "PTS = 18.2011 + 1.0419*DREB + 1.086*AST + 1.262*PFD"

## [1] "PTS = 13.7557 + 1.1261*DREB + 0.9266*AST + 0.8738*STL + 1.1944*PFD"

## [1] "PTS = 18.6874 + 1.1381*DREB + 0.9121*AST + -0.5523*TOV + 1.0465*STL + 1.2736*PFD"

Defensive rebounds and assists do seem to be correlated with points scored, although the prediction errors are quite large. The predictions also sit systematically below the actual values, likely because league-wide scoring in 2017 was higher than in most of the other seasons used to fit the models. As a rough correction, I shifted each prediction up by the standard deviation of the predictions, which leaves the errors more centered around zero.
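
To put a number on that undershoot before re-plotting, here is the mean signed error for the last (five-variable) model left over from the loop above:

mean(predicted_points_3 - NBA_data_2017_3$PTS) # a negative value confirms the systematic undershoot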

for(i in 1:n_3){
  f <- as.formula(
    paste("PTS", 
          paste(predictors_3[i,1:i], collapse = " + "), 
          sep = " ~ "))
  lm_predict_3 <- lm(formula = f, data = NBA_data_other_3)
  predicted_points_3 <- as.numeric(predict(lm_predict_3, newdata = NBA_data_2017_3))
  eqn <- paste("PTS = ", round(lm_predict_3$coefficients[1], 4), " + ",
               paste(round(lm_predict_3$coefficients[2:(i+1)], 4), predictors_3[i,1:i],
                     sep = "*", collapse = " + "), sep = "")
  print(eqn)
  plot(predicted_points_3 + sd(predicted_points_3) - NBA_data_2017_3$PTS,ylab = "Error of Prediction",ylim=c(-10,10)) # shift each prediction up by the SD of the predictions before taking the error
  abline(h = 0, col = "red", lty = 2, lwd = 2)
} 
## [1] "PTS = 56.3024 + 1.3963*DREB"

## [1] "PTS = 46.0816 + 1.0471*DREB + 0.9769*AST"

## [1] "PTS = 18.2011 + 1.0419*DREB + 1.086*AST + 1.262*PFD"

## [1] "PTS = 13.7557 + 1.1261*DREB + 0.9266*AST + 0.8738*STL + 1.1944*PFD"

## [1] "PTS = 18.6874 + 1.1381*DREB + 0.9121*AST + -0.5523*TOV + 1.0465*STL + 1.2736*PFD"