R: NBA End of Season Predictions



How We Make NBA End of Season Predictions

The first thing we do is scrape the data off of basketball reference for the entire 2017-2018 season. If you’ve read our piece on how we make the NBA rankings, this will look pretty familiar. The main difference is that we are not removing games that are still to come

library(XML)
library(RCurl)
months <- c("october", "november", "december", "january", "february", "march", "april")

for(i in 1:length(months)){
  u <- paste0("https://www.basketball-reference.com/leagues/NBA_2018_games-",months[i],".html")
  newu <- getURL(u)
  data <- readHTMLTable(newu)
  
  if(i==1){
    
    season <- data$schedule
    season$`Visitor/Neutral` <- as.character(season$`Visitor/Neutral`)
    season$`Home/Neutral` <- as.character(season$`Home/Neutral`)
    season$PTSV <- as.numeric(as.character(season[[4]]))
    season$PTSH <- as.numeric(as.character(season[[6]]))
    
    
  }
  
  else{
    season1 <- data$schedule
    season1$`Visitor/Neutral` <- as.character(season1$`Visitor/Neutral`)
    season1$`Home/Neutral` <- as.character(season1$`Home/Neutral`)
    season1$PTSV <- as.numeric(as.character(season1[[4]]))
    season1$PTSH <- as.numeric(as.character(season1[[6]]))
    season <- rbind(season, season1)
    
  }
  if(i != 1 && i != length(months)){
    remove(season1)
  }
  
}
head(season)
##                Date Start (ET)   Visitor/Neutral PTS          Home/Neutral
## 1 Tue, Oct 17, 2017    8:01 pm    Boston Celtics  99   Cleveland Cavaliers
## 2 Tue, Oct 17, 2017   10:30 pm   Houston Rockets 122 Golden State Warriors
## 3 Wed, Oct 18, 2017    7:30 pm   Milwaukee Bucks 108        Boston Celtics
## 4 Wed, Oct 18, 2017    8:30 pm     Atlanta Hawks 117      Dallas Mavericks
## 5 Wed, Oct 18, 2017    7:00 pm Charlotte Hornets  90       Detroit Pistons
## 6 Wed, Oct 18, 2017    7:00 pm     Brooklyn Nets 131        Indiana Pacers
##   PTS            Attend. Notes PTSV PTSH
## 1 102 Box Score     20,562         99  102
## 2 121 Box Score     19,596        122  121
## 3 100 Box Score     18,624        108  100
## 4 111 Box Score     19,709        117  111
## 5 102 Box Score     20,491         90  102
## 6 140 Box Score     15,008        131  140

The data frame called season has our basic information stored. Next we will make an intermediate dataframe with information we need

inter <- data.frame(season$`Visitor/Neutral`, season$PTSV, season$`Home/Neutral`, season$PTSH)
inter$season..Visitor.Neutral. <- as.character(inter$season..Visitor.Neutral.)
inter$season..Home.Neutral. <- as.character(inter$season..Home.Neutral.)


inter$pt_dif <- inter$season.PTSH - inter$season.PTSV
head(inter)
##   season..Visitor.Neutral. season.PTSV season..Home.Neutral. season.PTSH
## 1           Boston Celtics          99   Cleveland Cavaliers         102
## 2          Houston Rockets         122 Golden State Warriors         121
## 3          Milwaukee Bucks         108        Boston Celtics         100
## 4            Atlanta Hawks         117      Dallas Mavericks         111
## 5        Charlotte Hornets          90       Detroit Pistons         102
## 6            Brooklyn Nets         131        Indiana Pacers         140
##   pt_dif
## 1      3
## 2     -1
## 3     -8
## 4     -6
## 5     12
## 6      9

Next we will put the data into a frame called cleanse, with team, opponent, location, data, and ptdif as the only columns

team <- character(length = 2*length(inter$pt_dif))
opponent <- character(length = 2*length(inter$pt_dif))
location <- character(length = 2*length(inter$pt_dif))
ptdif <- vector(mode='numeric',length = 2*length(inter$pt_dif))


cleanse <- data.frame(team,opponent,location,ptdif)

cleanse$team <- as.character(cleanse$team)
cleanse$opponent <- as.character(cleanse$opponent)
cleanse$location <- as.character(cleanse$location)
cleanse$date <- character(length = 2*length(inter$pt_dif))

for(i in 1:length(inter$pt_dif)){
  
  #Here, we double count games so that each game has an entry for the home team being "team" and the away team being "team"
  cleanse$date[i] <- as.character(season$Date[i])
  cleanse$team[i] <- inter$season..Home.Neutral.[i]
  cleanse$opponent[i] <- inter$season..Visitor.Neutral.[i]
  cleanse$location[i] <- "H"
  cleanse$ptdif[i] <- inter$pt_dif[i]
  
  
  cleanse$date[i + length(inter$pt_dif)] <- as.character(season$Date[i]) 
  cleanse$team[i + length(inter$pt_dif)] <- inter$season..Visitor.Neutral.[i]
  cleanse$opponent[i + length(inter$pt_dif)] <- inter$season..Home.Neutral.[i]
  cleanse$location[i + length(inter$pt_dif)] <- "A"
  cleanse$ptdif[i + length(inter$pt_dif)] <- (-1)*inter$pt_dif[i]
  
}
head(cleanse)
##                    team          opponent location ptdif              date
## 1   Cleveland Cavaliers    Boston Celtics        H     3 Tue, Oct 17, 2017
## 2 Golden State Warriors   Houston Rockets        H    -1 Tue, Oct 17, 2017
## 3        Boston Celtics   Milwaukee Bucks        H    -8 Wed, Oct 18, 2017
## 4      Dallas Mavericks     Atlanta Hawks        H    -6 Wed, Oct 18, 2017
## 5       Detroit Pistons Charlotte Hornets        H    12 Wed, Oct 18, 2017
## 6        Indiana Pacers     Brooklyn Nets        H     9 Wed, Oct 18, 2017

Now we have to make sure the data frame earns its name by cleaning it up! This includes adjusting for neutral location games, as well as adding a new column with a binary “did this team win?”.

cleanse$location[363] <- "N"
cleanse$location[376] <- "N"
cleanse$location[615] <- "N"
cleanse$location[363 + length(inter$pt_dif)] <- "N"
cleanse$location[376 + length(inter$pt_dif)] <- "N"
cleanse$location[615+ length(inter$pt_dif)] <- "N"

cleanse$win <- rep(NA, length(cleanse$date))

for(i in 1:length(inter$pt_dif)){
 
  if(!is.na(cleanse$ptdif[i])){
    
    if(cleanse$ptdif[i] > 0){
      cleanse$win[i] <- 1
    }
    else if(cleanse$ptdif[i] < 0){
      cleanse$win[i] <- 0
    }
    
    if(cleanse$ptdif[i + length(inter$pt_dif)] > 0){
      cleanse$win[i + length(inter$pt_dif)] <- 1
    }
    else if(cleanse$ptdif[i + length(inter$pt_dif)] < 0){
      cleanse$win[i + length(inter$pt_dif)] <- 0
    }  
  }
  
  
}
cleanse <- cleanse[c(5,1,2,3,4,6)]
head(cleanse)
##                date                  team          opponent location ptdif
## 1 Tue, Oct 17, 2017   Cleveland Cavaliers    Boston Celtics        H     3
## 2 Tue, Oct 17, 2017 Golden State Warriors   Houston Rockets        H    -1
## 3 Wed, Oct 18, 2017        Boston Celtics   Milwaukee Bucks        H    -8
## 4 Wed, Oct 18, 2017      Dallas Mavericks     Atlanta Hawks        H    -6
## 5 Wed, Oct 18, 2017       Detroit Pistons Charlotte Hornets        H    12
## 6 Wed, Oct 18, 2017        Indiana Pacers     Brooklyn Nets        H     9
##   win
## 1   1
## 2   0
## 3   0
## 4   0
## 5   1
## 6   1

Now we’re ready to start making predictions! The first step is creating two new columns: one for simulated score differential and one for win probability. We can start out with the info we already have as a base. In every simulated season we perform, the simulated score differential for a past game will be the same. This of course means the win probability is 1 or 0, since it has already happened.

cleanse$sim_ptdif <- cleanse$ptdif
cleanse$win_prob <- cleanse$win

Now we define a function that takes in a team, opponent, and location, then uses our two linear models to predict the score differential and win probability. Check out our other piece on how we make the linear models!

ptdif_call <- function(home,away,HNA){
  
  arr <- c(0,0)
  
  r1 <- rankings$yusag_coeff[which(rankings$team == home)]
  r2 <- rankings$yusag_coeff[which(rankings$team == away)]
  
  
  if(HNA == "H"){
    pt_dif <- r1 - r2 - coefficients(lm.NBAhoops)[[1]]
  }
  
  if(HNA == "N"){
    pt_dif <- r1 - r2
  }
  
  if(HNA == "A"){
    pt_dif <- r1 - r2 + coefficients(lm.NBAhoops)[[1]]
  }
  
  arr[1] <- pt_dif
  
  prob <- 1 / (1+ exp(- coefficients(glm.pointspread)[[2]] * pt_dif))
  arr[2] <- prob
  
  return(arr)
}

The next thing we do is call this function for each game that has not yet happened to fill out the empty entries in our predicted point differential and win probability

for(i in 1:(length(cleanse$ptdif))){
  
  if(is.na(cleanse$ptdif[i])){
    cleanse$sim_ptdif[i] <- ptdif_call(cleanse$team[i], cleanse$opponent[i], cleanse$location[i])[1]
    cleanse$win_prob[i] <- ptdif_call(cleanse$team[i], cleanse$opponent[i], cleanse$location[i])[2]
  }
  
}

In order to find our probabilites, we need to define a mostly empty data frame with each team seperated by conference. This is important because until the NBA just lets the 16 best teams in the playoffs, we’ll have western conference snubs that would make it in the east.

west <- c("Golden State Warriors", "Dallas Mavericks", "Memphis Grizzlies", "Phoenix Suns", "Sacramento Kings", "San Antonio Spurs", "Utah Jazz", "Los Angeles Lakers","Oklahoma City Thunder", "Minnesota Timberwolves", "New Orleans Pelicans", "Denver Nuggets", "Houston Rockets", "Los Angeles Clippers", "Portland Trail Blazers")
east <- setdiff(rankings$team, west)

wins_current <- rep(NA, 15)
wins_projected <- rep(NA, 15)
playoff_prob <- rep(NA, 15)
top_four_prob <- rep(NA, 15)
top_two_prob <- rep(NA, 15)
top_seed_prob <- rep(NA,15)


west_data <- data.frame(west,wins_current,wins_projected,playoff_prob, top_four_prob, top_two_prob, top_seed_prob, stringsAsFactors = FALSE)
east_data <- data.frame(east,wins_current,wins_projected,playoff_prob, top_four_prob, top_two_prob, top_seed_prob, stringsAsFactors = FALSE)
#This will all be filled in eventually
west_data
##                      west wins_current wins_projected playoff_prob
## 1   Golden State Warriors           NA             NA           NA
## 2        Dallas Mavericks           NA             NA           NA
## 3       Memphis Grizzlies           NA             NA           NA
## 4            Phoenix Suns           NA             NA           NA
## 5        Sacramento Kings           NA             NA           NA
## 6       San Antonio Spurs           NA             NA           NA
## 7               Utah Jazz           NA             NA           NA
## 8      Los Angeles Lakers           NA             NA           NA
## 9   Oklahoma City Thunder           NA             NA           NA
## 10 Minnesota Timberwolves           NA             NA           NA
## 11   New Orleans Pelicans           NA             NA           NA
## 12         Denver Nuggets           NA             NA           NA
## 13        Houston Rockets           NA             NA           NA
## 14   Los Angeles Clippers           NA             NA           NA
## 15 Portland Trail Blazers           NA             NA           NA
##    top_four_prob top_two_prob top_seed_prob
## 1             NA           NA            NA
## 2             NA           NA            NA
## 3             NA           NA            NA
## 4             NA           NA            NA
## 5             NA           NA            NA
## 6             NA           NA            NA
## 7             NA           NA            NA
## 8             NA           NA            NA
## 9             NA           NA            NA
## 10            NA           NA            NA
## 11            NA           NA            NA
## 12            NA           NA            NA
## 13            NA           NA            NA
## 14            NA           NA            NA
## 15            NA           NA            NA

Here we start to fill in our data frame. We add in the YUSAG coefficients. We also find the current wins and expected wins by summing up the win and win probability columns, respectively

west_data$yusag_coeff <- rep(NA, 15)
east_data$yusag_coeff <- rep(NA, 15)

rankings$team <- as.character(rankings$team)

for(i in 1:15){
  
  west_data$wins_current[i] <- sum(cleanse$win[cleanse$team == west_data$west[i]], na.rm = TRUE)
  east_data$wins_current[i] <- sum(cleanse$win[cleanse$team == east_data$east[i]], na.rm = TRUE)
  
  west_data$wins_projected[i] <- sum(cleanse$win_prob[cleanse$team == west_data$west[i]], na.rm = TRUE)
  east_data$wins_projected[i] <- sum(cleanse$win_prob[cleanse$team == east_data$east[i]], na.rm = TRUE)
  
  west_data$yusag_coeff[i] <- rankings$yusag_coeff[which(rankings$team == west_data$west[i])]
  east_data$yusag_coeff[i] <- rankings$yusag_coeff[which(rankings$team == east_data$east[i])]
}

east_data <- east_data[c(1,2,3,8,4,5,6,7)]
west_data <- west_data[c(1,2,3,8,4,5,6,7)]

Now it’s time to simulate seasons! We will create a new data frame for each conference where we tally up the data for each simulated season ie count how many of the seasons a team makes the playoffs.

experimental_east <- data.frame(east, stringsAsFactors = FALSE)
experimental_west <- data.frame(west, stringsAsFactors = FALSE)
experimental_east$sample_wins <- rep(NA, 15)
experimental_west$sample_wins <- rep(NA, 15)
experimental_east$playoff_counts <- rep(0, 15)
experimental_west$playoff_counts <- rep(0, 15)
experimental_east$four_counts <- rep(0, 15)
experimental_west$four_counts <- rep(0, 15)
experimental_east$two_counts <- rep(0, 15)
experimental_west$two_counts <- rep(0, 15)
experimental_east$one_counts <- rep(0, 15)
experimental_west$one_counts <- rep(0, 15)

Now we simulate N seasons. I like 15000. The more seasons we simulate, the closer we will get to the actual probability that our model suggests.

N <- 15000
cleanse$sim_win <- cleanse$win
min_game <- min(which(is.na(cleanse$sim_win)))

for(j in 1:N){
  #To simulate a season, we generate 1230 random numbers between 1 and 0, one for each game of the season
  play_games <- runif(1230)
  for(i in (min_game):1230){
    
    #For a given game, the winner is decided by wheather or not the random number generated is less than the win probability
    if(play_games[i] < cleanse$win_prob[i]){
      cleanse$sim_win[i] <- 1
      cleanse$sim_win[i + 1230] <- 0
    }
    else{
      cleanse$sim_win[i] <- 0
      cleanse$sim_win[i + 1230] <- 1
    }
    
  }
  for(k in 1:15){
     experimental_east$sample_wins[k] <- sum(cleanse$sim_win[cleanse$team == experimental_east$east[k]])
    experimental_west$sample_wins[k] <- sum(cleanse$sim_win[cleanse$team == experimental_west$west[k]])    
  }

  #The way we collect data is by sorting by wins in a simulated season, then adding a count to the team that finishes either 1, top 2, top 4, or top 8 in each conference
  experimental_east <- experimental_east[order(experimental_east$sample_wins,decreasing = TRUE),]
  experimental_west <- experimental_west[order(experimental_west$sample_wins,decreasing = TRUE),]
  
  experimental_east$playoff_counts[1:8] <- experimental_east$playoff_counts[1:8] + 1
  experimental_west$playoff_counts[1:8] <- experimental_west$playoff_counts[1:8] + 1
  
  experimental_east$four_counts[1:4] <- experimental_east$four_counts[1:4] + 1
  experimental_west$four_counts[1:4] <- experimental_west$four_counts[1:4] + 1
  
  experimental_east$two_counts[1:2] <- experimental_east$two_counts[1:2] + 1
  experimental_west$two_counts[1:2] <- experimental_west$two_counts[1:2] + 1
  
  experimental_east$one_counts[1] <- experimental_east$one_counts[1] + 1
  experimental_west$one_counts[1] <- experimental_west$one_counts[1] + 1

  if(j %% 1000 == 0){
    print(j)
  }
}
## [1] 1000
## [1] 2000
## [1] 3000
## [1] 4000
## [1] 5000
## [1] 6000
## [1] 7000
## [1] 8000
## [1] 9000
## [1] 10000
## [1] 11000
## [1] 12000
## [1] 13000
## [1] 14000
## [1] 15000

Now that we have simulated N seasons, we can add the probabilities to our data frame by dividing the number of occurences by the number of trials.

for(i in 1:15){
  west_data$playoff_prob[i] <- experimental_west$playoff_counts[which(experimental_west$west == west_data$west[i])] / N
  east_data$playoff_prob[i] <- experimental_east$playoff_counts[which(experimental_east$east == east_data$east[i])] / N
  
  west_data$top_four_prob[i] <- experimental_west$four_counts[which(experimental_west$west == west_data$west[i])] / N
  east_data$top_four_prob[i] <- experimental_east$four_counts[which(experimental_east$east == east_data$east[i])] / N
  
  west_data$top_two_prob[i] <- experimental_west$two_counts[which(experimental_west$west == west_data$west[i])] / N
  east_data$top_two_prob[i] <- experimental_east$two_counts[which(experimental_east$east == east_data$east[i])] / N
  
  west_data$top_seed_prob[i] <- experimental_west$one_counts[which(experimental_west$west == west_data$west[i])] / N
  east_data$top_seed_prob[i] <- experimental_east$one_counts[which(experimental_east$east == east_data$east[i])] / N
}

Now we just have some last minute cleaning to do, including rounding some values, sorting our results, and writing these results to a csv file

west_data <- west_data[order(west_data$wins_projected, decreasing = TRUE),]
east_data <- east_data[order(east_data$wins_projected, decreasing = TRUE),]
colnames(west_data)[1] <- "team"
colnames(east_data)[1] <- "team"

west_data$playoff_prob <- round(west_data$playoff_prob, digits = 3)
east_data$playoff_prob <- round(east_data$playoff_prob, digits = 3)

west_data$top_seed_prob <- round(west_data$top_seed_prob, digits = 3)
east_data$top_seed_prob <- round(east_data$top_seed_prob, digits = 3)

west_data$top_two_prob <- round(west_data$top_two_prob, digits = 3)
east_data$top_two_prob <- round(east_data$top_two_prob, digits = 3)

west_data$top_four_prob <- round(west_data$top_four_prob, digits = 3)
east_data$top_four_prob <- round(east_data$top_four_prob, digits = 3)

west_data$wins_projected <- round(west_data$wins_projected, digits = 1)
east_data$wins_projected <- round(east_data$wins_projected, digits = 1)

west_data$yusag_coeff <- round(west_data$yusag_coeff, digits = 2)
east_data$yusag_coeff <- round(east_data$yusag_coeff, digits = 2)

write.csv(west_data, "WEST_RANKINGS.csv", row.names = FALSE)
write.csv(east_data, "EAST_RANKINGS.csv", row.names = FALSE)

#Let's look at the results
east_data
##                   team wins_current wins_projected yusag_coeff
## 1      Toronto Raptors           52           61.3        7.84
## 2       Boston Celtics           47           54.4        3.28
## 3   Philadelphia 76ers           39           48.1        2.60
## 6  Cleveland Cavaliers           41           47.1       -0.22
## 4   Washington Wizards           40           46.6        1.05
## 5       Indiana Pacers           41           46.0        0.71
## 7           Miami Heat           38           44.1       -0.26
## 8      Milwaukee Bucks           37           42.6       -0.83
## 10     Detroit Pistons           31           37.1       -1.54
## 9    Charlotte Hornets           30           35.7       -1.08
## 11     New York Knicks           26           30.4       -3.33
## 15       Chicago Bulls           24           27.7       -6.06
## 12       Brooklyn Nets           23           27.0       -4.22
## 13       Orlando Magic           21           25.9       -4.74
## 14       Atlanta Hawks           20           23.0       -5.65
##    playoff_prob top_four_prob top_two_prob top_seed_prob
## 1         1.000         1.000        1.000         0.999
## 2         1.000         1.000        0.999         0.001
## 3         1.000         0.809        0.001         0.000
## 6         1.000         0.558        0.000         0.000
## 4         1.000         0.399        0.000         0.000
## 5         1.000         0.212        0.000         0.000
## 7         1.000         0.020        0.000         0.000
## 8         0.996         0.002        0.000         0.000
## 10        0.004         0.000        0.000         0.000
## 9         0.000         0.000        0.000         0.000
## 11        0.000         0.000        0.000         0.000
## 15        0.000         0.000        0.000         0.000
## 12        0.000         0.000        0.000         0.000
## 13        0.000         0.000        0.000         0.000
## 14        0.000         0.000        0.000         0.000
west_data
##                      team wins_current wins_projected yusag_coeff
## 13        Houston Rockets           56           65.9        8.79
## 1   Golden State Warriors           53           61.7        7.37
## 15 Portland Trail Blazers           44           50.2        2.48
## 9   Oklahoma City Thunder           43           48.3        2.88
## 6       San Antonio Spurs           41           47.3        2.93
## 10 Minnesota Timberwolves           40           47.0        2.40
## 7               Utah Jazz           40           46.9        3.22
## 11   New Orleans Pelicans           40           46.2        0.58
## 14   Los Angeles Clippers           37           43.5        0.73
## 12         Denver Nuggets           38           43.0        0.69
## 8      Los Angeles Lakers           31           36.3       -1.61
## 2        Dallas Mavericks           22           27.4       -2.06
## 5        Sacramento Kings           23           25.8       -7.27
## 3       Memphis Grizzlies           19           22.4       -5.21
## 4            Phoenix Suns           19           21.2       -9.10
##    playoff_prob top_four_prob top_two_prob top_seed_prob
## 13        1.000         1.000            1         0.996
## 1         1.000         1.000            1         0.004
## 15        1.000         0.906            0         0.000
## 9         0.995         0.505            0         0.000
## 6         0.970         0.231            0         0.000
## 10        0.953         0.144            0         0.000
## 7         0.952         0.145            0         0.000
## 11        0.892         0.067            0         0.000
## 14        0.160         0.001            0         0.000
## 12        0.078         0.000            0         0.000
## 8         0.000         0.000            0         0.000
## 2         0.000         0.000            0         0.000
## 5         0.000         0.000            0         0.000
## 3         0.000         0.000            0         0.000
## 4         0.000         0.000            0         0.000

Be sure to follow @YUSAG_NBA on twitter for the latest updates. Thanks for reading!