How We Make NBA End of Season Predictions
daniel_tokarz
March 20, 2018
The first thing we do is scrape the data off of basketball reference for the entire 2017-2018 season. If you’ve read our piece on how we make the NBA rankings, this will look pretty familiar. The main difference is that we are not removing games that are still to come
library(XML)
library(RCurl)
months <- c("october", "november", "december", "january", "february", "march", "april")
for(i in 1:length(months)){
u <- paste0("https://www.basketball-reference.com/leagues/NBA_2018_games-",months[i],".html")
newu <- getURL(u)
data <- readHTMLTable(newu)
if(i==1){
season <- data$schedule
season$`Visitor/Neutral` <- as.character(season$`Visitor/Neutral`)
season$`Home/Neutral` <- as.character(season$`Home/Neutral`)
season$PTSV <- as.numeric(as.character(season[[4]]))
season$PTSH <- as.numeric(as.character(season[[6]]))
}
else{
season1 <- data$schedule
season1$`Visitor/Neutral` <- as.character(season1$`Visitor/Neutral`)
season1$`Home/Neutral` <- as.character(season1$`Home/Neutral`)
season1$PTSV <- as.numeric(as.character(season1[[4]]))
season1$PTSH <- as.numeric(as.character(season1[[6]]))
season <- rbind(season, season1)
}
if(i != 1 && i != length(months)){
remove(season1)
}
}
head(season)
## Date Start (ET) Visitor/Neutral PTS Home/Neutral
## 1 Tue, Oct 17, 2017 8:01 pm Boston Celtics 99 Cleveland Cavaliers
## 2 Tue, Oct 17, 2017 10:30 pm Houston Rockets 122 Golden State Warriors
## 3 Wed, Oct 18, 2017 7:30 pm Milwaukee Bucks 108 Boston Celtics
## 4 Wed, Oct 18, 2017 8:30 pm Atlanta Hawks 117 Dallas Mavericks
## 5 Wed, Oct 18, 2017 7:00 pm Charlotte Hornets 90 Detroit Pistons
## 6 Wed, Oct 18, 2017 7:00 pm Brooklyn Nets 131 Indiana Pacers
## PTS Â Â Attend. Notes PTSV PTSH
## 1 102 Box Score 20,562 99 102
## 2 121 Box Score 19,596 122 121
## 3 100 Box Score 18,624 108 100
## 4 111 Box Score 19,709 117 111
## 5 102 Box Score 20,491 90 102
## 6 140 Box Score 15,008 131 140
The data frame called season has our basic information stored. Next we will make an intermediate dataframe with information we need
inter <- data.frame(season$`Visitor/Neutral`, season$PTSV, season$`Home/Neutral`, season$PTSH)
inter$season..Visitor.Neutral. <- as.character(inter$season..Visitor.Neutral.)
inter$season..Home.Neutral. <- as.character(inter$season..Home.Neutral.)
inter$pt_dif <- inter$season.PTSH - inter$season.PTSV
head(inter)
## season..Visitor.Neutral. season.PTSV season..Home.Neutral. season.PTSH
## 1 Boston Celtics 99 Cleveland Cavaliers 102
## 2 Houston Rockets 122 Golden State Warriors 121
## 3 Milwaukee Bucks 108 Boston Celtics 100
## 4 Atlanta Hawks 117 Dallas Mavericks 111
## 5 Charlotte Hornets 90 Detroit Pistons 102
## 6 Brooklyn Nets 131 Indiana Pacers 140
## pt_dif
## 1 3
## 2 -1
## 3 -8
## 4 -6
## 5 12
## 6 9
Next we will put the data into a frame called cleanse, with team, opponent, location, data, and ptdif as the only columns
team <- character(length = 2*length(inter$pt_dif))
opponent <- character(length = 2*length(inter$pt_dif))
location <- character(length = 2*length(inter$pt_dif))
ptdif <- vector(mode='numeric',length = 2*length(inter$pt_dif))
cleanse <- data.frame(team,opponent,location,ptdif)
cleanse$team <- as.character(cleanse$team)
cleanse$opponent <- as.character(cleanse$opponent)
cleanse$location <- as.character(cleanse$location)
cleanse$date <- character(length = 2*length(inter$pt_dif))
for(i in 1:length(inter$pt_dif)){
#Here, we double count games so that each game has an entry for the home team being "team" and the away team being "team"
cleanse$date[i] <- as.character(season$Date[i])
cleanse$team[i] <- inter$season..Home.Neutral.[i]
cleanse$opponent[i] <- inter$season..Visitor.Neutral.[i]
cleanse$location[i] <- "H"
cleanse$ptdif[i] <- inter$pt_dif[i]
cleanse$date[i + length(inter$pt_dif)] <- as.character(season$Date[i])
cleanse$team[i + length(inter$pt_dif)] <- inter$season..Visitor.Neutral.[i]
cleanse$opponent[i + length(inter$pt_dif)] <- inter$season..Home.Neutral.[i]
cleanse$location[i + length(inter$pt_dif)] <- "A"
cleanse$ptdif[i + length(inter$pt_dif)] <- (-1)*inter$pt_dif[i]
}
head(cleanse)
## team opponent location ptdif date
## 1 Cleveland Cavaliers Boston Celtics H 3 Tue, Oct 17, 2017
## 2 Golden State Warriors Houston Rockets H -1 Tue, Oct 17, 2017
## 3 Boston Celtics Milwaukee Bucks H -8 Wed, Oct 18, 2017
## 4 Dallas Mavericks Atlanta Hawks H -6 Wed, Oct 18, 2017
## 5 Detroit Pistons Charlotte Hornets H 12 Wed, Oct 18, 2017
## 6 Indiana Pacers Brooklyn Nets H 9 Wed, Oct 18, 2017
Now we have to make sure the data frame earns its name by cleaning it up! This includes adjusting for neutral location games, as well as adding a new column with a binary “did this team win?”.
cleanse$location[363] <- "N"
cleanse$location[376] <- "N"
cleanse$location[615] <- "N"
cleanse$location[363 + length(inter$pt_dif)] <- "N"
cleanse$location[376 + length(inter$pt_dif)] <- "N"
cleanse$location[615+ length(inter$pt_dif)] <- "N"
cleanse$win <- rep(NA, length(cleanse$date))
for(i in 1:length(inter$pt_dif)){
if(!is.na(cleanse$ptdif[i])){
if(cleanse$ptdif[i] > 0){
cleanse$win[i] <- 1
}
else if(cleanse$ptdif[i] < 0){
cleanse$win[i] <- 0
}
if(cleanse$ptdif[i + length(inter$pt_dif)] > 0){
cleanse$win[i + length(inter$pt_dif)] <- 1
}
else if(cleanse$ptdif[i + length(inter$pt_dif)] < 0){
cleanse$win[i + length(inter$pt_dif)] <- 0
}
}
}
cleanse <- cleanse[c(5,1,2,3,4,6)]
head(cleanse)
## date team opponent location ptdif
## 1 Tue, Oct 17, 2017 Cleveland Cavaliers Boston Celtics H 3
## 2 Tue, Oct 17, 2017 Golden State Warriors Houston Rockets H -1
## 3 Wed, Oct 18, 2017 Boston Celtics Milwaukee Bucks H -8
## 4 Wed, Oct 18, 2017 Dallas Mavericks Atlanta Hawks H -6
## 5 Wed, Oct 18, 2017 Detroit Pistons Charlotte Hornets H 12
## 6 Wed, Oct 18, 2017 Indiana Pacers Brooklyn Nets H 9
## win
## 1 1
## 2 0
## 3 0
## 4 0
## 5 1
## 6 1
Now we’re ready to start making predictions! The first step is creating two new columns: one for simulated score differential and one for win probability. We can start out with the info we already have as a base. In every simulated season we perform, the simulated score differential for a past game will be the same. This of course means the win probability is 1 or 0, since it has already happened.
cleanse$sim_ptdif <- cleanse$ptdif
cleanse$win_prob <- cleanse$win
Now we define a function that takes in a team, opponent, and location, then uses our two linear models to predict the score differential and win probability. Check out our other piece on how we make the linear models!
ptdif_call <- function(home,away,HNA){
arr <- c(0,0)
r1 <- rankings$yusag_coeff[which(rankings$team == home)]
r2 <- rankings$yusag_coeff[which(rankings$team == away)]
if(HNA == "H"){
pt_dif <- r1 - r2 - coefficients(lm.NBAhoops)[[1]]
}
if(HNA == "N"){
pt_dif <- r1 - r2
}
if(HNA == "A"){
pt_dif <- r1 - r2 + coefficients(lm.NBAhoops)[[1]]
}
arr[1] <- pt_dif
prob <- 1 / (1+ exp(- coefficients(glm.pointspread)[[2]] * pt_dif))
arr[2] <- prob
return(arr)
}
The next thing we do is call this function for each game that has not yet happened to fill out the empty entries in our predicted point differential and win probability
for(i in 1:(length(cleanse$ptdif))){
if(is.na(cleanse$ptdif[i])){
cleanse$sim_ptdif[i] <- ptdif_call(cleanse$team[i], cleanse$opponent[i], cleanse$location[i])[1]
cleanse$win_prob[i] <- ptdif_call(cleanse$team[i], cleanse$opponent[i], cleanse$location[i])[2]
}
}
In order to find our probabilites, we need to define a mostly empty data frame with each team seperated by conference. This is important because until the NBA just lets the 16 best teams in the playoffs, we’ll have western conference snubs that would make it in the east.
west <- c("Golden State Warriors", "Dallas Mavericks", "Memphis Grizzlies", "Phoenix Suns", "Sacramento Kings", "San Antonio Spurs", "Utah Jazz", "Los Angeles Lakers","Oklahoma City Thunder", "Minnesota Timberwolves", "New Orleans Pelicans", "Denver Nuggets", "Houston Rockets", "Los Angeles Clippers", "Portland Trail Blazers")
east <- setdiff(rankings$team, west)
wins_current <- rep(NA, 15)
wins_projected <- rep(NA, 15)
playoff_prob <- rep(NA, 15)
top_four_prob <- rep(NA, 15)
top_two_prob <- rep(NA, 15)
top_seed_prob <- rep(NA,15)
west_data <- data.frame(west,wins_current,wins_projected,playoff_prob, top_four_prob, top_two_prob, top_seed_prob, stringsAsFactors = FALSE)
east_data <- data.frame(east,wins_current,wins_projected,playoff_prob, top_four_prob, top_two_prob, top_seed_prob, stringsAsFactors = FALSE)
#This will all be filled in eventually
west_data
## west wins_current wins_projected playoff_prob
## 1 Golden State Warriors NA NA NA
## 2 Dallas Mavericks NA NA NA
## 3 Memphis Grizzlies NA NA NA
## 4 Phoenix Suns NA NA NA
## 5 Sacramento Kings NA NA NA
## 6 San Antonio Spurs NA NA NA
## 7 Utah Jazz NA NA NA
## 8 Los Angeles Lakers NA NA NA
## 9 Oklahoma City Thunder NA NA NA
## 10 Minnesota Timberwolves NA NA NA
## 11 New Orleans Pelicans NA NA NA
## 12 Denver Nuggets NA NA NA
## 13 Houston Rockets NA NA NA
## 14 Los Angeles Clippers NA NA NA
## 15 Portland Trail Blazers NA NA NA
## top_four_prob top_two_prob top_seed_prob
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## 7 NA NA NA
## 8 NA NA NA
## 9 NA NA NA
## 10 NA NA NA
## 11 NA NA NA
## 12 NA NA NA
## 13 NA NA NA
## 14 NA NA NA
## 15 NA NA NA
Here we start to fill in our data frame. We add in the YUSAG coefficients. We also find the current wins and expected wins by summing up the win and win probability columns, respectively
west_data$yusag_coeff <- rep(NA, 15)
east_data$yusag_coeff <- rep(NA, 15)
rankings$team <- as.character(rankings$team)
for(i in 1:15){
west_data$wins_current[i] <- sum(cleanse$win[cleanse$team == west_data$west[i]], na.rm = TRUE)
east_data$wins_current[i] <- sum(cleanse$win[cleanse$team == east_data$east[i]], na.rm = TRUE)
west_data$wins_projected[i] <- sum(cleanse$win_prob[cleanse$team == west_data$west[i]], na.rm = TRUE)
east_data$wins_projected[i] <- sum(cleanse$win_prob[cleanse$team == east_data$east[i]], na.rm = TRUE)
west_data$yusag_coeff[i] <- rankings$yusag_coeff[which(rankings$team == west_data$west[i])]
east_data$yusag_coeff[i] <- rankings$yusag_coeff[which(rankings$team == east_data$east[i])]
}
east_data <- east_data[c(1,2,3,8,4,5,6,7)]
west_data <- west_data[c(1,2,3,8,4,5,6,7)]
Now it’s time to simulate seasons! We will create a new data frame for each conference where we tally up the data for each simulated season ie count how many of the seasons a team makes the playoffs.
experimental_east <- data.frame(east, stringsAsFactors = FALSE)
experimental_west <- data.frame(west, stringsAsFactors = FALSE)
experimental_east$sample_wins <- rep(NA, 15)
experimental_west$sample_wins <- rep(NA, 15)
experimental_east$playoff_counts <- rep(0, 15)
experimental_west$playoff_counts <- rep(0, 15)
experimental_east$four_counts <- rep(0, 15)
experimental_west$four_counts <- rep(0, 15)
experimental_east$two_counts <- rep(0, 15)
experimental_west$two_counts <- rep(0, 15)
experimental_east$one_counts <- rep(0, 15)
experimental_west$one_counts <- rep(0, 15)
Now we simulate N seasons. I like 15000. The more seasons we simulate, the closer we will get to the actual probability that our model suggests.
N <- 15000
cleanse$sim_win <- cleanse$win
min_game <- min(which(is.na(cleanse$sim_win)))
for(j in 1:N){
#To simulate a season, we generate 1230 random numbers between 1 and 0, one for each game of the season
play_games <- runif(1230)
for(i in (min_game):1230){
#For a given game, the winner is decided by wheather or not the random number generated is less than the win probability
if(play_games[i] < cleanse$win_prob[i]){
cleanse$sim_win[i] <- 1
cleanse$sim_win[i + 1230] <- 0
}
else{
cleanse$sim_win[i] <- 0
cleanse$sim_win[i + 1230] <- 1
}
}
for(k in 1:15){
experimental_east$sample_wins[k] <- sum(cleanse$sim_win[cleanse$team == experimental_east$east[k]])
experimental_west$sample_wins[k] <- sum(cleanse$sim_win[cleanse$team == experimental_west$west[k]])
}
#The way we collect data is by sorting by wins in a simulated season, then adding a count to the team that finishes either 1, top 2, top 4, or top 8 in each conference
experimental_east <- experimental_east[order(experimental_east$sample_wins,decreasing = TRUE),]
experimental_west <- experimental_west[order(experimental_west$sample_wins,decreasing = TRUE),]
experimental_east$playoff_counts[1:8] <- experimental_east$playoff_counts[1:8] + 1
experimental_west$playoff_counts[1:8] <- experimental_west$playoff_counts[1:8] + 1
experimental_east$four_counts[1:4] <- experimental_east$four_counts[1:4] + 1
experimental_west$four_counts[1:4] <- experimental_west$four_counts[1:4] + 1
experimental_east$two_counts[1:2] <- experimental_east$two_counts[1:2] + 1
experimental_west$two_counts[1:2] <- experimental_west$two_counts[1:2] + 1
experimental_east$one_counts[1] <- experimental_east$one_counts[1] + 1
experimental_west$one_counts[1] <- experimental_west$one_counts[1] + 1
if(j %% 1000 == 0){
print(j)
}
}
## [1] 1000
## [1] 2000
## [1] 3000
## [1] 4000
## [1] 5000
## [1] 6000
## [1] 7000
## [1] 8000
## [1] 9000
## [1] 10000
## [1] 11000
## [1] 12000
## [1] 13000
## [1] 14000
## [1] 15000
Now that we have simulated N seasons, we can add the probabilities to our data frame by dividing the number of occurences by the number of trials.
for(i in 1:15){
west_data$playoff_prob[i] <- experimental_west$playoff_counts[which(experimental_west$west == west_data$west[i])] / N
east_data$playoff_prob[i] <- experimental_east$playoff_counts[which(experimental_east$east == east_data$east[i])] / N
west_data$top_four_prob[i] <- experimental_west$four_counts[which(experimental_west$west == west_data$west[i])] / N
east_data$top_four_prob[i] <- experimental_east$four_counts[which(experimental_east$east == east_data$east[i])] / N
west_data$top_two_prob[i] <- experimental_west$two_counts[which(experimental_west$west == west_data$west[i])] / N
east_data$top_two_prob[i] <- experimental_east$two_counts[which(experimental_east$east == east_data$east[i])] / N
west_data$top_seed_prob[i] <- experimental_west$one_counts[which(experimental_west$west == west_data$west[i])] / N
east_data$top_seed_prob[i] <- experimental_east$one_counts[which(experimental_east$east == east_data$east[i])] / N
}
Now we just have some last minute cleaning to do, including rounding some values, sorting our results, and writing these results to a csv file
west_data <- west_data[order(west_data$wins_projected, decreasing = TRUE),]
east_data <- east_data[order(east_data$wins_projected, decreasing = TRUE),]
colnames(west_data)[1] <- "team"
colnames(east_data)[1] <- "team"
west_data$playoff_prob <- round(west_data$playoff_prob, digits = 3)
east_data$playoff_prob <- round(east_data$playoff_prob, digits = 3)
west_data$top_seed_prob <- round(west_data$top_seed_prob, digits = 3)
east_data$top_seed_prob <- round(east_data$top_seed_prob, digits = 3)
west_data$top_two_prob <- round(west_data$top_two_prob, digits = 3)
east_data$top_two_prob <- round(east_data$top_two_prob, digits = 3)
west_data$top_four_prob <- round(west_data$top_four_prob, digits = 3)
east_data$top_four_prob <- round(east_data$top_four_prob, digits = 3)
west_data$wins_projected <- round(west_data$wins_projected, digits = 1)
east_data$wins_projected <- round(east_data$wins_projected, digits = 1)
west_data$yusag_coeff <- round(west_data$yusag_coeff, digits = 2)
east_data$yusag_coeff <- round(east_data$yusag_coeff, digits = 2)
write.csv(west_data, "WEST_RANKINGS.csv", row.names = FALSE)
write.csv(east_data, "EAST_RANKINGS.csv", row.names = FALSE)
#Let's look at the results
east_data
## team wins_current wins_projected yusag_coeff
## 1 Toronto Raptors 52 61.3 7.84
## 2 Boston Celtics 47 54.4 3.28
## 3 Philadelphia 76ers 39 48.1 2.60
## 6 Cleveland Cavaliers 41 47.1 -0.22
## 4 Washington Wizards 40 46.6 1.05
## 5 Indiana Pacers 41 46.0 0.71
## 7 Miami Heat 38 44.1 -0.26
## 8 Milwaukee Bucks 37 42.6 -0.83
## 10 Detroit Pistons 31 37.1 -1.54
## 9 Charlotte Hornets 30 35.7 -1.08
## 11 New York Knicks 26 30.4 -3.33
## 15 Chicago Bulls 24 27.7 -6.06
## 12 Brooklyn Nets 23 27.0 -4.22
## 13 Orlando Magic 21 25.9 -4.74
## 14 Atlanta Hawks 20 23.0 -5.65
## playoff_prob top_four_prob top_two_prob top_seed_prob
## 1 1.000 1.000 1.000 0.999
## 2 1.000 1.000 0.999 0.001
## 3 1.000 0.809 0.001 0.000
## 6 1.000 0.558 0.000 0.000
## 4 1.000 0.399 0.000 0.000
## 5 1.000 0.212 0.000 0.000
## 7 1.000 0.020 0.000 0.000
## 8 0.996 0.002 0.000 0.000
## 10 0.004 0.000 0.000 0.000
## 9 0.000 0.000 0.000 0.000
## 11 0.000 0.000 0.000 0.000
## 15 0.000 0.000 0.000 0.000
## 12 0.000 0.000 0.000 0.000
## 13 0.000 0.000 0.000 0.000
## 14 0.000 0.000 0.000 0.000
west_data
## team wins_current wins_projected yusag_coeff
## 13 Houston Rockets 56 65.9 8.79
## 1 Golden State Warriors 53 61.7 7.37
## 15 Portland Trail Blazers 44 50.2 2.48
## 9 Oklahoma City Thunder 43 48.3 2.88
## 6 San Antonio Spurs 41 47.3 2.93
## 10 Minnesota Timberwolves 40 47.0 2.40
## 7 Utah Jazz 40 46.9 3.22
## 11 New Orleans Pelicans 40 46.2 0.58
## 14 Los Angeles Clippers 37 43.5 0.73
## 12 Denver Nuggets 38 43.0 0.69
## 8 Los Angeles Lakers 31 36.3 -1.61
## 2 Dallas Mavericks 22 27.4 -2.06
## 5 Sacramento Kings 23 25.8 -7.27
## 3 Memphis Grizzlies 19 22.4 -5.21
## 4 Phoenix Suns 19 21.2 -9.10
## playoff_prob top_four_prob top_two_prob top_seed_prob
## 13 1.000 1.000 1 0.996
## 1 1.000 1.000 1 0.004
## 15 1.000 0.906 0 0.000
## 9 0.995 0.505 0 0.000
## 6 0.970 0.231 0 0.000
## 10 0.953 0.144 0 0.000
## 7 0.952 0.145 0 0.000
## 11 0.892 0.067 0 0.000
## 14 0.160 0.001 0 0.000
## 12 0.078 0.000 0 0.000
## 8 0.000 0.000 0 0.000
## 2 0.000 0.000 0 0.000
## 5 0.000 0.000 0 0.000
## 3 0.000 0.000 0 0.000
## 4 0.000 0.000 0 0.000
Be sure to follow @YUSAG_NBA on twitter for the latest updates. Thanks for reading!