Predicting Points in the NBA




Predicting Points Scored

The goal of this project was to determine which team statistics were the best predictors of points scored.

First, I loaded in the NBA team data from 2009-2018 and did some cleaning.

NBA_data <- read.csv("NBA_TEAM_DATA_2009-2018 - Sheet1.csv",header = T, stringsAsFactors = F)

NBA_data <- NBA_data[-seq(31,279,31),]

for(i in 2:length(NBA_data)){
  NBA_data[,i] <- as.numeric(NBA_data[,i])
}

The next part was determining the best predictors. To do so, I built a loop that would test all of the combinations of a certain number of variables (from 1 to 5, but I could test more), and then stored the combination that produced the smallest residual standard mean error.

n <- 5 # maximum number of variables to test
predictors <- matrix(NA,nrow = n, ncol = n+1) # create empty matrix to store the best predictors for a given number of variables (from 1 to n)

for(i in 1:n){
  permutations <- combn(19,i) # create all the combinations of the variables (there are 19 total and we are finding the best linear model using i of them (where i is from 1 to n))
  
  permutations <- permutations + 7 # columns 8-26 of NBA_data contain the variables to test, so we add 7 to the combinations so they match the column numbers
  
  num_permutations <- ncol(permutations)
  rsme <- matrix(NA,nrow=num_permutations,ncol= i+1) # create empty matrix to store the rsme of each combination of variables
  
  # For each combination of i variables (where i is 1:n)
  for(j in 1:num_permutations){
      variables <- rep(NA,i) # create empty vector to to be filled in with the variables being tested on that iteration
      
      for(k in 1:i){
        variables[k] <- names(NBA_data[permutations[k,j]]) # fill the vector with the appropriate variables corresponding to that combination
        
        rsme[j,k] <- names(NBA_data[permutations[k,j]]) # put the variable names in the rsme matrix so that the corresponding rsme can be recorded
      }
      
      # Create the formula for the linear model of PTS as a function of the combination of variables we've just established
      f <- as.formula(
        paste("PTS", 
              paste(variables, collapse = " + "), 
              sep = " ~ "))
    lm_points <- lm(formula = f,data = NBA_data) # make linear model
    rsme[j,i+1] <- rmse(lm_points) # record rsme
  }
  for(k in 1:i){
    predictors[i,k] <- rsme[which.min(rsme[,i+1]),k] # record in the matrix the variable names of the linear model that produced the lowest rsme
  }
  predictors[i,n+1] <- rsme[which.min(rsme[,i+1]),i+1] # record in that matrix the rsme as well
}
predictors
##      [,1]  [,2]       [,3]       [,4]       [,5]  [,6]                
## [1,] "FGM" NA         NA         NA         NA    "2.42463525665235"  
## [2,] "FGM" "FTM"      NA         NA         NA    "1.75788594659143"  
## [3,] "FGM" "X3.00.PM" "FTM"      NA         NA    "0.0782929027447956"
## [4,] "FGM" "FGA"      "X3.00.PM" "FTM"      NA    "0.0770910317436641"
## [5,] "FGM" "FGA"      "FG."      "X3.00.PM" "FTM" "0.072392019693894"

Now, we are going to test these linear models. To do this, we use the linear model created from all years before 2017, and use this to predict points scored by teams in 2017, and then examine the error of this prediction.

NBA_data_2017 <- NBA_data[which(NBA_data$Year == 2017),] # create data fram with data only from 2017
NBA_data_other <- NBA_data[which(NBA_data$Year != 2017),] # create data from with all other data

# For each number of predictors (from 1 to 5)
for(i in 1:n){
  # Create linear model using the variables that had the smallest rsme, which are stored in the predictors matrix
  f <- as.formula(
    paste("PTS", 
          paste(predictors[i,1:i], collapse = " + "), 
          sep = " ~ "))
  lm_predict <- lm(formula = f, data = NBA_data_other) # create linear model for prediction
  string <- print(paste("PTS = ",round(lm_predict$coefficients[1],4)," + ",paste(round(lm_predict$coefficients[1:i+1],4),predictors[i,1:i], sep = "*",collapse = " + "),sep="")) # print linear model as an equation above the plot we are going to create
  predicted_points <- as.numeric(predict(lm_predict, newdata = NBA_data_2017)) # use linear model to predict points for 2017
  plot(predicted_points - NBA_data_2017$PTS,ylab = "Error of Prediction",ylim = c(-5,5)) # plot predicted points - actual points
  abline(h = 0, col = "red", lty = 2, lwd = 2) # draw a dotted line at 0
}  
## [1] "PTS = 3.9584 + 2.5668*FGM"

## [1] "PTS = -9.4133 + 2.4852*FGM + 0.9225*FTM"

## [1] "PTS = 0.0533 + 1.9999*FGM + 1.0004*X3.00.PM + 0.9972*FTM"

## [1] "PTS = -0.2856 + 1.9927*FGM + 0.0071*FGA + 0.9989*X3.00.PM + 0.9991*FTM"