SHARE

TWEET

# ass

a guest
Sep 11th, 2019
135
Never

**Not a member of Pastebin yet? __Sign Up__ — it unlocks many cool features!**

---
title: "NBA_ONLY_CLEANING"
author: "Malte Højmark-Bertelsen"
date: "7/16/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(pacman)
# Load every package used in this document.
# (Fixed: tidyverse was listed twice in the original p_load call.)
p_load(tidyverse, corrgram, lubridate, gridExtra, caret, BSDA, rpart,
       rpart.plot, C50, randomForest, class, pastecs, propagate, utils,
       clusterGeneration)
```
```{r}
# Read the raw shot-log data and keep an untouched copy for later reference.
# NOTE(review): setwd() in a script is fragile — consider an absolute path or
# a project-root helper instead; kept here to preserve behavior.
setwd("/cloud/project")
nba <- read.csv("shot_logs.csv")
nba_raw <- nba
```
- ```{r}
- ```
```{r}
# Verify that SHOT_RESULT ("made"/"missed") and FGM (1/0) encode the same
# information, then drop the redundant column.
ls.str(nba)
nba$SHOT_RESULT <- nba$SHOT_RESULT == "made"
nba$FGM <- nba$FGM == 1
summary(nba$SHOT_RESULT == nba$FGM)
summary(nba)
# SHOT_RESULT always equals FGM, so one of the two can be removed:
nba <- dplyr::select(nba, -SHOT_RESULT)

## MATCHUP handling
# MATCHUP looks like "MAR 04, 2015 - CHA @ BKN"; the part before "-" is the
# date. trimws() removes the trailing blank the gsub() leaves behind.
# NOTE(review): "%b" month parsing is locale dependent — confirm the month
# abbreviations match the session locale.
nba$DATE <- trimws(gsub("-.*", "", nba$MATCHUP))
nba$DATE <- as.Date(nba$DATE, format = "%b %d, %Y")
# The part after "-" holds the two teams; split it into a shooting team and a
# defending team column.
nba$ONLYTEAMS <- gsub(".*-", "", nba$MATCHUP)
nba$SHOOTING_TEAM <- str_extract(nba$ONLYTEAMS, regex("\\w+"))
nba$DEFENDING_TEAM <- str_extract(nba$ONLYTEAMS, regex("[.|@] \\w+"))
nba$DEFENDING_TEAM <- gsub("[.|@] ", "", nba$DEFENDING_TEAM)
nba <- dplyr::select(nba, -ONLYTEAMS)
# Recode the logical FGM back to 1/0.
# (Was `ifelse(nba$FGM == T, 1, 0)`: `T` is a reassignable alias, and
# as.integer() on a logical is the direct equivalent, NAs included.)
nba$FGM <- as.integer(nba$FGM)
# Convert GAME_CLOCK ("mm:ss") and SHOT_CLOCK (seconds) to minutes.
nba$GAME_CLOCK <- period_to_seconds(ms(nba$GAME_CLOCK)) / 60
nba$SHOT_CLOCK <- nba$SHOT_CLOCK / 60
# TOTAL_GAMETIME = minutes elapsed in the game when the shot was taken.
# GAME_CLOCK counts down within a 12-minute period, so elapsed time is
# 12 * period - game_clock; both branches of the original ifelse() reduce to
# exactly this expression.
# NOTE(review): overtime periods are 5 minutes, not 12 — confirm whether the
# data contains periods > 4 and whether this matters downstream.
period_num <- as.numeric(as.character(nba$PERIOD))
nba$TOTAL_GAMETIME <- period_num * 12 - nba$GAME_CLOCK
# Changing shooting player name format into lastname, firstname.
#nba$player_name <- as.person(nba$player_name)
#nba$player_name <- format(nba$player_name, include=c("family","given"), braces=list(family=c("",",")))
```
```{r}
# Outlier investigation: shots with a non-positive TOUCH_TIME, which should be
# impossible. Compare them with the full data variable by variable to look for
# patterns before deciding how to treat them.
nba_touchtime_outliers <- nba[which(nba$TOUCH_TIME <= 0),]
# GAME_ID: the outliers look like the full data.
plot(nba_touchtime_outliers$GAME_ID)
plot(nba$GAME_ID)
# MATCHUP: same picture.
plot(nba_touchtime_outliers$MATCHUP)
plot(nba$MATCHUP)
# LOCATION: same picture.
plot(nba_touchtime_outliers$LOCATION)
plot(nba$LOCATION)
# W: slightly more losses among the outliers, but both splits are close to
# 50/50, so the difference is immaterial.
plot(nba_touchtime_outliers$W)
plot(nba$W)
# FINAL_MARGIN: more or less the same.
hist(nba_touchtime_outliers$FINAL_MARGIN)
hist(nba$FINAL_MARGIN)
# SHOT_NUMBER: the same.
hist(nba_touchtime_outliers$SHOT_NUMBER)
hist(nba$SHOT_NUMBER)
# PERIOD: a small, unimportant difference.
plot(nba_touchtime_outliers$PERIOD)
plot(nba$PERIOD)
# GAME_CLOCK: more or less the same.
hist(nba_touchtime_outliers$GAME_CLOCK)
hist(nba$GAME_CLOCK)
# SHOT_CLOCK: the outliers have a much higher share of high shot clocks. A
# plausible explanation is putbacks: a missed shot hitting the rim resets the
# shot clock, and a player who immediately tips in the rebound gets a touch
# time near 0 with a freshly reset (high) shot clock.
par(mfrow=c(4,2))
hist(nba_touchtime_outliers$SHOT_CLOCK, breaks = 20, main = "Erroneous Data", xlab = "Shot Clock", ylab = "Frequency")
# Fixed labels: this second histogram shows the full data (the title said
# "Erroneous Data" and "SHot Clock" was a typo).
hist(nba$SHOT_CLOCK, breaks = 20, main = "Original Data", xlab = "Shot Clock", ylab = "Frequency")
# For a putback the shot clock should have just reset, i.e. be >= 23 seconds
# (0.38 minutes). Count zero-touch shots, and zero-touch shots with high clock.
length(which(nba_touchtime_outliers$TOUCH_TIME == 0))
length(which(nba_touchtime_outliers$TOUCH_TIME == 0 & nba_touchtime_outliers$SHOT_CLOCK >= 0.38))
# Of the 3046 zero-touch observations, 1269 also have a shot clock of 23+.
# Those could plausibly be putbacks; a putback should additionally be a
# 2-pointer taken close to the hoop.
putback <- nba_touchtime_outliers[which(nba_touchtime_outliers$SHOT_CLOCK >= 0.38),]
hist(putback$SHOT_DIST, breaks = 100)
# Most of these shots are near distance 0, consistent with putbacks, though a
# second spike further out speaks against the theory.
plot(as.factor(putback$PTS_TYPE))# mostly 2-pointers, as a putback would be
# DRIBBLES: essentially all touch-time outliers have 0 dribbles; the full data
# also peaks at 0 but tails off towards ~20.
hist(nba_touchtime_outliers$DRIBBLES, breaks = 30, main = "Erroneous Data", xlab = "Dribbles", ylab = "Frequency")
hist(nba$DRIBBLES, breaks = 30, main = "Original Data", xlab = "Dribbles", ylab = "Frequency")
length(which(nba_touchtime_outliers$DRIBBLES > 0))
hist(nba_touchtime_outliers$SHOT_DIST)
length(which(nba_touchtime_outliers$SHOT_DIST > 7.12))
780/3358*100
nba_touchtime_outliers[which(nba_touchtime_outliers$SHOT_DIST == 20),]
plot(nba_touchtime_outliers$FGM)
length(which(nba_touchtime_outliers$TOUCH_TIME == 0))
length(which(nba_touchtime_outliers$SHOT_DIST <= 7.12))
summary(nba_touchtime_outliers$SHOT_DIST)
# Was: attach(nba_touchtime_outliers) — attach() pollutes the search path and
# stays attached for the rest of the session; the columns are qualified
# explicitly instead (identical results, no lingering side effect).
length(which(nba_touchtime_outliers$TOUCH_TIME < 0 & nba_touchtime_outliers$DRIBBLES == 0))
length(which(nba_touchtime_outliers$TOUCH_TIME < 0))
nba_touchtime_outliers[which(nba_touchtime_outliers$SHOT_DIST == 44.8),]
hist(nba_touchtime_outliers$CLOSE_DEF_DIST)
length(which(nba_touchtime_outliers$TOUCH_TIME < 0))
hist(putback$SHOT_DIST, breaks = 100)
hist(putback$DRIBBLES)
summary(putback$SHOT_DIST)
hist(putback$SHOT_DIST, breaks = 100)
alley_oop <- nba_touchtime_outliers[which(nba_touchtime_outliers$DRIBBLES == 0 & nba_touchtime_outliers$SHOT_DIST <7.12 & nba_touchtime_outliers$SHOT_CLOCK < 0.38),]
# NOTE(review): missing_df is only created further down in this document, so
# the line below errored when the document was knitted top-to-bottom; disabled
# until it is moved after missing_df exists.
#length(which(missing_df$SHOT_DIST > 7.12 & missing_df$TOUCH_TIME == 0))
length(which(putback$SHOT_DIST >= 7.12))
# Share of 0-dribble shots among the outliers vs the full data.
length(which(nba_touchtime_outliers$DRIBBLES == 0))# 3347 of the outliers
length(which(nba$DRIBBLES == 0))# 63195 in the full data
3347/63195*100# deleting the outliers removes ~5.3% of the 0-dribble shots.
# The dominance of 0 dribbles supports the immediate-shot/putback explanation.
# SHOT_DIST: the outliers cluster near the hoop where the full data centres
# around 5-25 feet — again consistent with putbacks.
hist(nba_touchtime_outliers$SHOT_DIST, breaks = 100, main = "Erroneous Data", xlab = "Shot Distance", ylab = "Frequency")
hist(nba$SHOT_DIST, breaks = 100, main = "Original Data", xlab = "Shot Distance", ylab = "Frequency")
# PTS_TYPE: the outliers have relatively more 2-pointers, as putbacks would.
plot(as.factor(nba_touchtime_outliers$PTS_TYPE), main = "Erroneous Data", xlab="Points Type", ylab="Frequency")
plot(as.factor(nba$PTS_TYPE), main = "Original Data", xlab="Points Type", ylab="Frequency")
# CLOSEST_DEFENDER: roughly the same.
plot(nba_touchtime_outliers$CLOSEST_DEFENDER)
plot(nba$CLOSEST_DEFENDER)
# CLOSE_DEF_DIST: roughly the same, defenders slightly closer for the outliers.
hist(nba_touchtime_outliers$CLOSE_DEF_DIST, breaks = 100)
hist(nba$CLOSE_DEF_DIST, breaks = 100)
mean(nba_touchtime_outliers$CLOSE_DEF_DIST)#Mean is 3.29
mean(nba$CLOSE_DEF_DIST)#Mean is 4.12
# FGM: roughly the same — surprising, since putbacks should convert at a
# higher rate than ordinary shots.
plot(nba_touchtime_outliers$FGM)
plot(nba$FGM)
# PTS: slightly more mass at 0 and 2 among the outliers.
hist(nba_touchtime_outliers$PTS, breaks = 3)
hist(nba$PTS, breaks = 3)
# player_id (player name is equivalent): roughly the same overall, with two
# players over-represented among the outliers.
plot(nba_touchtime_outliers$player_id)
plot(nba$player_id)
```
```{r}
# Identify missing data: summary(nba) shows NAs only in SHOT_CLOCK.
summary(nba)
# Flag rows with a missing shot clock and pull them into their own frame.
nba$missing1 <- ifelse(is.na(nba$SHOT_CLOCK), 1, 0)
missing_df <- nba[nba$missing1==1,]
5567/128069*100# 4.35% of all observations have a missing shot clock
summary(missing_df)
```
```{r}
### Look for patterns or trends in the missing data, to judge whether simply
### deleting the missing observations would introduce bias.
```
```{r}
# GAME_CLOCK of the missing observations vs the full data.
par(mfrow = c(1,2))
hist(missing_df$GAME_CLOCK, breaks = 50)# game clock, missing data
hist(nba$GAME_CLOCK, breaks = 50)# game clock, full data
# The missing data is concentrated at low game clocks. Quantify: the share of
# observations with a game clock of at most 1 minute in each frame.
# (Fixed: the original compared against the string "1", which coerces the
# numeric clock to character and compares lexicographically instead of
# numerically.)
table(missing_df$GAME_CLOCK <= 1)
3651/5567*100# 65.58% in the missing data
table(nba$GAME_CLOCK <= 1)
12625/128069*100# 9.86% in the full data
# So deleting the missing rows would disproportionately remove low-game-clock
# observations and bias the remaining data.
# Likely explanation: when fewer than 24 seconds (0.4 minutes) remain, the
# shot clock is switched off and the game clock effectively becomes the shot
# clock (see e.g. https://www.youtube.com/watch?v=w_Bk1quPHhY).
# Check that the game clock is essentially always >= the shot clock:
length(which(nba$GAME_CLOCK < nba$SHOT_CLOCK))
# Only 13 of 128069 observations violate this, supporting the explanation.
# Impute: where SHOT_CLOCK is NA and GAME_CLOCK <= 0.4 (24 s), use GAME_CLOCK.
nba[which(is.na(nba$SHOT_CLOCK) & nba$GAME_CLOCK<=0.4),]$SHOT_CLOCK = nba[which(is.na(nba$SHOT_CLOCK) & nba$GAME_CLOCK<=0.4),]$GAME_CLOCK
# Rebuild the missing-data frame after the imputation.
nba$missing1 <- ifelse(is.na(nba$SHOT_CLOCK), 1, 0)
missing_df <- nba[nba$missing1==1,]
```
```{r}
# GAME_ID of the missing observations.
par(mfrow=c(1,2))
# A handful of games carry most of the missing shot clocks — in a couple of
# matches nearly every shot lacks one.
plot(missing_df$GAME_ID)
plot(nba$GAME_ID)
# Frequency of missing observations per game.
df_GAME_ID <- data.frame(table(missing_df$GAME_ID))
summary(df_GAME_ID$Freq)
# Use the IQR rule to find games with an unusually high count of missing rows;
# visualize first, then compute the fence.
boxplot(df_GAME_ID$Freq)
IQR(df_GAME_ID$Freq)
5+1.5*IQR(df_GAME_ID$Freq)# a game becomes an outlier above 8 missing rows
length(which(df_GAME_ID$Freq>8))
24/904*100# 24 of 904 games (2.65%) have markedly more missing data than the
# rest, so deleting the missing rows hits those games hardest.
```
```{r}
# LOCATION (home/away) for missing vs full data.
par(mfrow=c(1,2))
plot(missing_df$LOCATION)
plot(nba$LOCATION)
# Compare the exact proportions.
table(missing_df$LOCATION)/2013*100# missing data: roughly half away, half home
table(nba$LOCATION)/128069*100# full data: 50.08% away, 49.92% home
# The home/away split of the missing data matches the full data, so the
# missingness looks random with respect to LOCATION.
```
```{r}
# Win/lose for missing vs full data.
par(mfrow=c(1,2))
plot(missing_df$W)
plot(nba$W)
table(missing_df$W)/2013*100# 49.58% L, 50.42% W in the missing data
table(nba$W)/128069*100# 49.56% L, 50.44% W in the full data
# Identical win/lose proportions — no pattern here.
```
```{r}
# FINAL_MARGIN for missing vs full data.
hist(missing_df$FINAL_MARGIN)
hist(nba$FINAL_MARGIN)
# Compare centre and spread.
mean(missing_df$FINAL_MARGIN)# -0.028
sd(missing_df$FINAL_MARGIN)# 10.34
mean(nba$FINAL_MARGIN)# 0.209
sd(nba$FINAL_MARGIN)# 13.233
# Mean and sd are close; the missingness looks random for this variable.
```
```{r}
# SHOT_NUMBER for missing vs full data.
hist(missing_df$SHOT_NUMBER)
hist(nba$SHOT_NUMBER)
mean(missing_df$SHOT_NUMBER)# 4.95
sd(missing_df$SHOT_NUMBER)# 3.70
mean(nba$SHOT_NUMBER)# 6.51
sd(nba$SHOT_NUMBER)# 4.71
# Slightly smaller centre and spread in the missing data, but nothing
# systematic.
```
```{r}
# PERIOD for missing vs full data.
plot(missing_df$PERIOD, main="Missing Data", xlab="Period", ylab="Frequency")
plot(nba$PERIOD, main="Original data", xlab="Period", ylab="Frequency")
# The missing data leans towards periods 1 and 2, where the full data is
# roughly uniform across the first four periods.
```
```{r}
# DRIBBLES for missing vs full data (SHOT_CLOCK itself is what is missing, so
# it is skipped).
hist(missing_df$DRIBBLES, breaks = 30)
hist(nba$DRIBBLES, breaks = 30)
# Percentage barplots make the comparison easier.
barplot(table(missing_df$DRIBBLES)/2013*100)
barplot(table(nba$DRIBBLES)/128069*100)
table(missing_df$DRIBBLES)/2013*100
table(nba$DRIBBLES)/128069*100
# Only about a 3 percentage-point difference at 0 dribbles — not systematic.
```
```{r}
# TOUCH_TIME for missing vs full data. The huge negative values stand out —
# a negative touch time should not be possible.
par(mfrow=c(1,2))
hist(missing_df$TOUCH_TIME, main = "Missing Data", xlab = "Touch Time", ylab = "Frequency")
hist(nba$TOUCH_TIME, main = "Original Data", xlab = "Touch Time", ylab = "Frequency")
mean(missing_df$TOUCH_TIME)# 2.42
sd(missing_df$TOUCH_TIME)# 4.53
mean(nba$TOUCH_TIME)# 2.77
sd(nba$TOUCH_TIME)# 3.04
# Touch time ranges from -163.6 to 24.9; it should lie within [0, 24] since
# the NBA shot clock is 24 seconds.
range(nba$TOUCH_TIME)
length(which(nba$TOUCH_TIME <= 0))# 3358 observations at or below zero
```
```{r}
# SHOT_DIST for missing vs full data — looks the same.
hist(missing_df$SHOT_DIST)
hist(nba$SHOT_DIST)
mean(missing_df$SHOT_DIST)# 13.69
sd(missing_df$SHOT_DIST)# 8.85
mean(nba$SHOT_DIST)# 13.57
sd(nba$SHOT_DIST)# 8.89
boxplot(missing_df$SHOT_DIST)
boxplot(nba$SHOT_DIST)
# No pattern or trend here.
```
```{r}
# PTS_TYPE for missing vs full data.
plot(as.factor(missing_df$PTS_TYPE))
plot(as.factor(nba$PTS_TYPE))
table(missing_df$PTS_TYPE)/2013*100# 73.82% twos, 26.18% threes
table(nba$PTS_TYPE)/128069*100# 73.53% twos, 26.47% threes
# Practically identical proportions — no pattern for this variable.
```
```{r}
# CLOSEST_DEFENDER (defender ID carries the same information, so it is
# skipped).
plot(missing_df$CLOSEST_DEFENDER)
plot(nba$CLOSEST_DEFENDER)
# Some defenders appear more often in the missing data, but their counts are
# far below their totals in the full data, so deleting the missing rows would
# not hit any defender hard.
```
```{r}
# CLOSE_DEF_DIST — the distributions look the same for missing and full data.
hist(missing_df$CLOSE_DEF_DIST, breaks = 40)
hist(nba$CLOSE_DEF_DIST, breaks = 40)
```
```{r}
# FGM for missing vs full data.
plot(missing_df$FGM)
plot(nba$FGM)
table(missing_df$FGM)/2013*100# 55.74% miss, 44.26% hit in the missing data
table(nba$FGM)/128069*100# 54.79% miss, 45.21% hit in the full data
# Essentially the same split — no pattern here either.
```
```{r}
# PTS for missing vs full data.
hist(missing_df$PTS, breaks = 3)
hist(nba$PTS, breaks = 3)
mean(missing_df$PTS)# 0.98
sd(missing_df$PTS)# 1.13
mean(nba$PTS)# 0.997
sd(nba$PTS)# 1.13
# No trend or pattern.
```
```{r}
# Shooter, via player_id.
plot(missing_df$player_id)
plot(nba$player_id)
# Some players are over-represented in the missing data, but the counts are
# far below their totals, so no player would be affected much by deletion.
```
```{r}
# Year of the shot.
hist(year(missing_df$DATE), breaks = 2)
hist(year(nba$DATE), breaks = 2)
# The missing data skews towards 2015 where the full data has more from 2014;
# still only a small number of observations relative to 128069.
```
```{r}
# SHOOTING_TEAM for missing vs full data.
par(mfrow=c(2,2))
plot(missing_df$SHOOTING_TEAM, main= "Missing Data", xlab= "Shooting Team", ylab= "Frequency")
plot(nba$SHOOTING_TEAM, main= "Original Data", xlab= "Shooting Team", ylab = "Frequency")
# The full data is fairly even across teams, but one team dominates the
# missing data.
table(missing_df$SHOOTING_TEAM)
# LAL accounts for 493 missing rows as the shooting team.
length(which(missing_df$SHOOTING_TEAM == "LAL" & missing_df$LOCATION == "H"))
323/493*100# 65.5% of LAL's missing shooting rows come from home games
```
```{r}
# DEFENDING_TEAM for missing vs full data.
plot(missing_df$DEFENDING_TEAM, main = "Missing Data", xlab="Defending Team", ylab="Frequency")
plot(nba$DEFENDING_TEAM, main = "Original Data", xlab = "Defending Team", ylab = "Frequency")
# Again the full data is roughly even while one team is missing far more.
table(missing_df$DEFENDING_TEAM)# LAL appears 464 times
# LAL also tops the missing data when defending, and teams keep their relative
# missing-data ranking across both roles — possibly a faulty camera/tracking
# system in their arena.
length(which(missing_df$DEFENDING_TEAM == "LAL" & missing_df$LOCATION == "A"))
312/464*100# 67.2% of LAL's missing defending rows are (their) home games
(323+312)/(493+464)*100# 66.35% of LAL's missing rows overall come from home
# Collect every missing observation involving LAL.
nba_LAL_DEF <- missing_df[missing_df$DEFENDING_TEAM == "LAL",]
nba_LAL_SHO <- missing_df[missing_df$SHOOTING_TEAM == "LAL",]
nba_LAL <- rbind(nba_LAL_DEF, nba_LAL_SHO)
# Compare LAL's missing rows with the full data, variable by variable.
par(mfrow=c(1,2))
plot(nba_LAL$LOCATION)
plot(nba$LOCATION)
plot(nba_LAL$W)
plot(nba$W)
hist(nba_LAL$FINAL_MARGIN)
hist(nba$FINAL_MARGIN)
hist(nba_LAL$SHOT_NUMBER)
hist(nba$SHOT_NUMBER)
# The period distribution differs for LAL compared to the full data.
plot(nba_LAL$PERIOD, main = "Los Angeles Lakers", xlab = "Period", ylab="Frequency")
plot(nba$PERIOD, main = "Original Data", xlab = "Period", ylab = "Frequency")
hist(nba_LAL$GAME_CLOCK)
hist(nba$GAME_CLOCK)
hist(nba_LAL$DRIBBLES)
hist(nba$DRIBBLES)
hist(nba_LAL$TOUCH_TIME)
hist(nba$TOUCH_TIME, breaks = 100)
hist(nba_LAL$SHOT_DIST)
hist(nba$SHOT_DIST)
plot(nba_LAL$PTS_TYPE)
plot(nba$PTS_TYPE)
hist(nba_LAL$CLOSE_DEF_DIST)
hist(nba$CLOSE_DEF_DIST, breaks = 100)
plot(nba_LAL$FGM)
plot(nba$FGM)
hist(nba_LAL$PTS, breaks = 3)
hist(nba$PTS, breaks = 3)
```
```{r}
# Finally, TOTAL_GAMETIME.
hist(missing_df$TOTAL_GAMETIME)
hist(nba$TOTAL_GAMETIME)
# The missing data concentrates at lower game times (0-20 minutes) where the
# full data is spread more evenly over 0 to 40.
```
```{r}
# Cap anomalous touch times at the 24-second shot clock and floor negative
# touch times at 0.
nba$TOUCH_TIME[which(nba$TOUCH_TIME > 24)] <- 24
nba$TOUCH_TIME[which(nba$TOUCH_TIME < 0)] <- 0
nba_noremoved <- nba
# Under the assumption that TOUCH_TIME == 0 means a putback/alley-oop, delete
# zero-touch shots taken beyond the mean shot distance (7.12 ft, decided by
# looking at videos of where players shoot from), and zero-touch shots with
# any dribbles.
summary(nba[which(nba$TOUCH_TIME == 0),]$SHOT_DIST) #mean value = 7.12
# Use logical keeps instead of `nba[-which(cond), ]`: if the condition matched
# no rows, `df[-integer(0), ]` would silently drop EVERY row. `%in% TRUE`
# treats NA in the condition as "keep", matching which()'s NA handling.
nba <- nba[!((nba$TOUCH_TIME == 0 & nba$SHOT_DIST > 7.12) %in% TRUE), ]
nba <- nba[!((nba$TOUCH_TIME == 0 & nba$DRIBBLES > 0) %in% TRUE), ]
# Coerce variables into their correct classes.
# (Removed the duplicated `nba$FGM <- as.factor(nba$FGM)` line.)
nba$GAME_CLOCK <- as.numeric(nba$GAME_CLOCK)
nba$player_id <- as.factor(nba$player_id)
nba$CLOSEST_DEFENDER_PLAYER_ID <- as.factor(nba$CLOSEST_DEFENDER_PLAYER_ID)
nba$FGM <- as.factor(nba$FGM)
nba$GAME_ID <- as.factor(nba$GAME_ID)
nba$DEFENDING_TEAM <- as.factor(nba$DEFENDING_TEAM)
nba$SHOOTING_TEAM <- as.factor(nba$SHOOTING_TEAM)
nba$PTS_TYPE <- as.factor(nba$PTS_TYPE)
nba$PERIOD <- as.factor(nba$PERIOD)
ls.str(nba)
summary(nba)
```
# Newer imputation code
```{r}
# Keep only the modelling variables.
nba_keep <- dplyr::select(nba, LOCATION, W, FINAL_MARGIN, SHOT_NUMBER, PERIOD, SHOT_CLOCK, TOUCH_TIME, SHOT_DIST, PTS_TYPE, CLOSE_DEF_DIST, TOTAL_GAMETIME, FGM)
str(nba_keep)
# PERIOD was made a factor above. Convert via as.character so we recover the
# period VALUES rather than the factor's internal level codes, and read from
# nba_keep itself rather than nba so the assignment cannot silently misalign.
nba_keep$PERIOD <- as.numeric(as.character(nba_keep$PERIOD))
nba_keep$SHOT_NUMBER <- as.numeric(nba_keep$SHOT_NUMBER)
ls.str(nba_keep)
# Turn LOCATION, W, PTS_TYPE into 0/1 dummy variables.
nba_1hot <- dplyr::select(nba, LOCATION, W, PTS_TYPE)
nba_1hot$LOCATION <- ifelse(nba_1hot$LOCATION == "H", 1, 0)
colnames(nba_1hot)[1] <- "LOCATION_HOME"
nba_1hot$W <- ifelse(nba_1hot$W == "W", 1, 0)
colnames(nba_1hot)[2] <- "WIN"
nba_1hot$PTS_TYPE <- ifelse(nba_1hot$PTS_TYPE == "2", 1, 0)
colnames(nba_1hot)[3] <- "PTS_TYPE_2"
# Bind the dummy variables to the rest of the columns, then drop the columns
# that were binarized.
nba_keep <- cbind(nba_keep, nba_1hot)
nba_keep <- dplyr::select(nba_keep, -c(W, LOCATION, PTS_TYPE))
# Place FGM as the last column.
nba_keep <- nba_keep %>% dplyr::select(-FGM, FGM)
# Predict SHOT_CLOCK from the other variables.
lm1 <- lm(SHOT_CLOCK~., data=nba_keep)
summary(lm1)
# Stepwise selection to find the best regression.
step1 <- step(lm1, direction="both")
plot(step1$residuals)
# Rows where SHOT_CLOCK is still NA (1987 found).
NAshotclock <- which(is.na(nba_keep$SHOT_CLOCK))
# Impute the missing values from the stepwise model.
nba_keep[NAshotclock,]$SHOT_CLOCK <- predict(step1, newdata=nba_keep[NAshotclock,])
# Compare imputed vs non-imputed distributions.
par(mfrow=c(1,3))
hist(nba_keep[NAshotclock,]$SHOT_CLOCK, main = "Imputed", xlab = "SHOT CLOCK")
hist(nba_keep[-NAshotclock,]$SHOT_CLOCK, main = "Original", xlab = "SHOT CLOCK")
hist(nba_keep$SHOT_CLOCK, main="Imputed + Original", xlab = "SHOT CLOCK")
nba_keep <- dplyr::select(nba_keep, -c(WIN, FINAL_MARGIN))
# Correlations between the remaining predictors.
# (Fixed plot title: it was a leftover "Iris data" caption from a corrgram
# example.)
corrgram(nba_keep[, -c(8:10)], main="Correlations of remaining NBA predictors",
         lower.panel=panel.pts, upper.panel=panel.cor,
         diag.panel=panel.density)
# Drop PERIOD, since it is highly correlated.
nba_keep <- dplyr::select(nba_keep, -PERIOD)
```
```{r}
# 75/25 train/test split, stratified on FGM.
set.seed(60)
train.index <- createDataPartition(nba_keep$FGM, p = .75, list = FALSE)
train <- nba_keep[ train.index,]
test <- nba_keep[-train.index,]
summary(nba_keep)
# Sanity-check that the split is balanced on a continuous variable.
# NOTE(review): FINAL_MARGIN was dropped from nba_keep in the previous chunk,
# so the original t.test(train$FINAL_MARGIN, test$FINAL_MARGIN) compared two
# NULLs and errored; SHOT_DIST is still present and serves the same purpose.
t.test(train$SHOT_DIST, test$SHOT_DIST)
```
```{r}
traindec <- train
testdec <- test

# Drop every row that falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] in any of the
# given columns, printing the number of outliers found per column (same
# diagnostic output as before). Extracted into a function because the original
# duplicated this loop verbatim for the train and test sets.
# Fixed: when no outliers are found, the original `df[-integer(0), ]` would
# silently drop EVERY row; the function now returns the data unchanged.
remove_iqr_outliers <- function(df, cols = 1:8) {
  outlier_rows <- c()
  for (i in cols) {
    # Upper/lower Tukey fences for column i.
    upper <- quantile(df[, i], 0.75, na.rm = TRUE) + (IQR(df[, i], na.rm = TRUE) * 1.5)
    lower <- quantile(df[, i], 0.25, na.rm = TRUE) - (IQR(df[, i], na.rm = TRUE) * 1.5)
    idx <- which(df[, i] < lower | df[, i] > upper)
    # Report the number of outliers in this column.
    print(paste(i, length(idx), sep = ''))
    outlier_rows <- c(outlier_rows, idx)
  }
  # unique() avoids counting a row twice; sorting keeps the output ordered.
  outlier_rows <- sort(unique(outlier_rows))
  if (length(outlier_rows) > 0) {
    df <- df[-outlier_rows, ]
  }
  df
}

traindec.iqr <- remove_iqr_outliers(traindec)
testdec.iqr <- remove_iqr_outliers(testdec)
boxplot(traindec.iqr)
# Unnormalized train and test sets.
train_final <- traindec.iqr
test_final <- testdec.iqr
```
##transformation
```{r}
train <- train_final
test <- test_final
# The last four columns are dummy/outcome variables -> factors.
for (i in c(9:12)){
  train[,i] <- as.factor(train[,i])
  test[,i] <- as.factor(test[,i])
}
# Combined set so distributional checks cover all observations at once.
normality <- bind_rows(train, test)
summary(normality)
# Checking the different variables for normality
str(normality) # variables 2 to 9 need to be checked for normality
# Final margin
hist(normality$FINAL_MARGIN) # looks normally distributed
qqnorm(normality$FINAL_MARGIN) # close to normal, some deviation in the tails
skewness(normality$FINAL_MARGIN) # skewness is 0.012868
# Shot number
hist(normality$SHOT_NUMBER, breaks = 5) # looks right skewed
qqnorm(normality$SHOT_NUMBER) # stepped pattern: discrete variable
skewness(normality$SHOT_NUMBER) # 0.119
skewness(log(normality$SHOT_NUMBER)) # natural log: -0.39
skewness(sqrt(normality$SHOT_NUMBER)) # square root: -0.14566
skewness(1/sqrt(normality$SHOT_NUMBER)) # inverse square root: 0.5973
# Period
hist(normality$PERIOD, breaks = 5) # looks right skewed
qqnorm(normality$PERIOD) # stepped pattern: discrete variable
# NOTE(review): the skewness values previously noted here duplicated the
# SHOT_NUMBER figures (copy-paste) -- re-run to record PERIOD's actual values.
skewness(normality$PERIOD)
skewness(log(normality$PERIOD))
skewness(sqrt(normality$PERIOD))
skewness(1/sqrt(normality$PERIOD))
# Shot clock
range(normality$SHOT_CLOCK)
# Replace exact zeros so log() and 1/sqrt() transforms stay finite.
normality[which(normality$SHOT_CLOCK == 0),]$SHOT_CLOCK <- 0.001
hist(normality$SHOT_CLOCK) # roughly normal with a small bump at high values
qqnorm(normality$SHOT_CLOCK) # normal-ish, deviations in both tails
skewness(normality$SHOT_CLOCK) # 0.0619
skewness(log(normality$SHOT_CLOCK)) # natural log: -2.496
# BUG FIX: the next two lines previously transformed SHOT_NUMBER instead of
# SHOT_CLOCK (copy-paste error) -- re-run to record the correct values.
skewness(sqrt(normality$SHOT_CLOCK))
skewness(1/sqrt(normality$SHOT_CLOCK))
# Touch time
normality[which(normality$TOUCH_TIME == 0),]$TOUCH_TIME <- 0.001
hist(normality$TOUCH_TIME) # right skewed
qqnorm(normality$TOUCH_TIME) # does not look normally distributed
skewness(normality$TOUCH_TIME) # 1.1555
skewness(log(normality$TOUCH_TIME)) # natural log: -3.236
skewness(sqrt(normality$TOUCH_TIME)) # square root: 0.4296
skewness(1/sqrt(normality$TOUCH_TIME)) # inverse square root: 6.593
# Shot distance
normality[which(normality$SHOT_DIST == 0),]$SHOT_DIST <- 0.001
hist(normality$SHOT_DIST) # bimodal: high counts at both ends of the range
qqnorm(normality$SHOT_DIST) # somewhat normal, deviation in the middle and low end
skewness(normality$SHOT_DIST) # 0.12868
skewness(log(normality$SHOT_DIST)) # natural log: -0.85977
skewness(sqrt(normality$SHOT_DIST)) # square root: -0.212
skewness(1/sqrt(normality$SHOT_DIST)) # inverse square root: 37.2369
# Closest defender distance
normality[which(normality$CLOSE_DEF_DIST == 0),]$CLOSE_DEF_DIST <- 0.001
hist(normality$CLOSE_DEF_DIST, breaks = 100) # small right skew
qqnorm(normality$CLOSE_DEF_DIST) # roughly normal, deviations in the tails
skewness(normality$CLOSE_DEF_DIST) # 0.509
skewness(log(normality$CLOSE_DEF_DIST)) # natural log: -4.66
skewness(sqrt(normality$CLOSE_DEF_DIST)) # square root: -0.3789
skewness(1/sqrt(normality$CLOSE_DEF_DIST)) # inverse square root: 10.5069
# Total game time
hist(normality$TOTAL_GAMETIME, breaks = 100) # fairly uniform, more mass near 0
qqnorm(normality$TOTAL_GAMETIME) # not normal; gaps between values suggest outliers
skewness(normality$TOTAL_GAMETIME) # 0.146
skewness(log(normality$TOTAL_GAMETIME)) # natural log: -1.5544
skewness(sqrt(normality$TOTAL_GAMETIME)) # square root: -0.467
skewness(1/sqrt(normality$TOTAL_GAMETIME)) # inverse square root: 4.39
```
##Decision trees
Factor transforming
```{r}
# Keep untransformed copies so tree split points can be denormalised later.
traindec.iqr_noZ <- traindec.iqr
testdec.iqr_noZ <- testdec.iqr
# Dummy-coded columns 7:9 become factors for the tree learners.
for (col in 7:9) {
  traindec.iqr[[col]] <- as.factor(traindec.iqr[[col]])
  testdec.iqr[[col]] <- as.factor(testdec.iqr[[col]])
}
str(traindec.iqr)
```
```{r}
# z-score standardisation of the six numeric predictors.
# NOTE(review): each set is scaled with its OWN mean/sd; applying the
# training statistics to the test set would avoid information leakage.
zscore <- function(v) (v - mean(v)) / sd(v)
for (col in 1:6) {
  traindec.iqr[[col]] <- zscore(traindec.iqr[[col]])
}
summary(traindec.iqr)
for (col in 1:6) {
  testdec.iqr[[col]] <- zscore(testdec.iqr[[col]])
}
```
Decision trees
```{r}
# rpart decision tree: let caret search for a good complexity parameter (cp).
#RPart decision trees
rpartfit <- train(x=traindec.iqr[,1:8], y=traindec.iqr$FGM, method = "rpart")
rpartfit$bestTune
set.seed(10)
# Tree grown with the cp found above (value hard-coded from rpartfit$bestTune
# of an earlier run -- re-check after re-running train()).
trainrpartcp <- rpart(FGM ~., data = traindec.iqr, method = "class", control = rpart.control(cp = 0.0009221977))
set.seed(10)
# Baseline tree with rpart's default cp, for comparison.
trainrpart <- rpart(FGM ~., data = traindec.iqr, method = "class")
summary(trainrpartcp)
rpart.plot(trainrpartcp)
set.seed(10)
# Predict shot outcome on the held-out test set.
estrpartFGM <- predict(trainrpartcp, testdec.iqr, type = "class")
table(testdec.iqr$FGM, estrpartFGM)
# Accuracy (%) from confusion-matrix diagonals; the counts are hard-coded
# from earlier runs -- refresh them from the table() output above.
(13749+4690)/(length(testdec.iqr$SHOT_NUMBER))*100 #Cp = 0.0009221977
(13061+4418)/(length(testdec.iqr$SHOT_NUMBER))*100 #No cp
# C5.0 decision tree on the same predictors; FGM must be a factor for C5.0.
#C5.0 decision trees
x <- traindec.iqr[,1:8]
y <- as.factor(traindec.iqr$FGM)
set.seed(10)
trainc50 <- C5.0(x,y, control = C5.0Control(minCases = 10))
summary(trainc50)
plot(trainc50)
set.seed(10)
estFGM <- predict(trainc50, testdec.iqr, type = "class")
table(estFGM, testdec.iqr$FGM)
# Accuracy (%) for different minCases settings; counts are hard-coded from
# earlier manual runs (only minCases = 10 matches the model fitted above).
(12766+4515)/(length(testdec.iqr$TOUCH_TIME))*100 #No mincases 61.64%
(12765+4509)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 5 61.62%
(12879+4379)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 10 61.64%
(12807+4458)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 15 61.59%
(12875+4387)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 20 61.58%
(12696+4581)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 50 61.63%
(12928+4345)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 100 61.61%
(12765+4480)/(length(testdec.iqr$TOUCH_TIME))*100 #Mincases 200 61.51%
# Denominator used above: number of test rows.
length(testdec.iqr$TOUCH_TIME)
```
Denormalising salient values
```{r}
# Convert salient tree split points (z-scores) back to original units,
# using the pre-standardisation training statistics kept in traindec.iqr_noZ.
denorm <- function(z, col) {
  z * sd(traindec.iqr_noZ[, col]) + mean(traindec.iqr_noZ[, col])
}
shotdist <- denorm(-0.9279527, 4)
shotdist
touch <- denorm(-0.3420969, 3)
touch
closestdef <- denorm(-0.6427635, 5)
closestdef
shotclock <- denorm(-1.751886, 2)
shotclock
shotdist2 <- denorm(0.6792926, 4)
shotdist2
shotdist3 <- denorm(-0.7593606, 4)
shotdist3
closestdef2 <- denorm(-0.6427635, 5)
closestdef2
shotclock2 <- denorm(1.274499, 2)
shotclock2
```
```{r}
#neural net on trainingset
# Min-max rescale the six numeric predictors to [0, 1] for the neural net.
# NOTE(review): each set is rescaled with its OWN min/max; reusing the
# training min/max on the test set would avoid information leakage.
rescale01 <- function(v) (v - min(v)) / (max(v) - min(v))
nba_train_norm <- train_final
nba_test_norm <- test_final
summary(nba_train_norm)
for (col in 1:6) {
  nba_train_norm[[col]] <- rescale01(nba_train_norm[[col]])
}
for (col in 1:6) {
  nba_test_norm[[col]] <- rescale01(nba_test_norm[[col]])
}
```
##Neural network analysis
```{r}
#Preparing the data for final preparation before neural net analysis
# Outcome becomes a factor; the two dummy predictors are coerced to numeric.
# NOTE(review): if PTS_TYPE_2 / LOCATION_HOME were ever factors at this
# point, as.numeric() would yield level codes (1/2) rather than 0/1 --
# they are expected to still be numeric dummies here; confirm upstream.
nba_train_norm$FGM <- as.factor(nba_train_norm$FGM)
nba_train_norm$PTS_TYPE_2 <- as.numeric(nba_train_norm$PTS_TYPE_2)
nba_train_norm$LOCATION_HOME <- as.numeric(nba_train_norm$LOCATION_HOME)
str(nba_train_norm)
nba_test_norm$FGM <- as.factor(nba_test_norm$FGM)
nba_test_norm$PTS_TYPE_2 <- as.numeric(nba_test_norm$PTS_TYPE_2)
nba_test_norm$LOCATION_HOME <- as.numeric(nba_test_norm$LOCATION_HOME)
str(nba_test_norm)
```
#Neural net with one and two layers
```{r}
set.seed(40)
#setting threshold to 3, since there seems to a local minimum that can't be overcome in some iterations
# NOTE(review): neuralnet() is not in the p_load() list at the top of the
# file -- confirm the neuralnet package is loaded before this chunk runs.
#Two layer ANN
net.dat <- neuralnet(formula = FGM~.,data=nba_train_norm, rep=5, hidden=c(5,5), linear.output = F, lifesign = "full", lifesign.step = 100, threshold=3, stepmax=10000)
#Predicting results
# Column 12 is dropped from the inputs -- presumably the FGM outcome; confirm.
nn.results <- compute(net.dat, nba_test_norm[,-12])
#Plotting the plot
plot(net.dat, rep="best", show.weights = F, col.out = "red", fontsize = 10)
#Confusion matrix of results
# net.result holds one probability column per class; the larger one wins.
results <- data.frame(actual = nba_test_norm$FGM, prediction = nn.results$net.result)
results$pred <- ifelse(results[,2] > results[,3], 0, 1)
table(results$actual,results$pred)
#One layer ANN
net.dat2 <- neuralnet(formula = FGM~.,data=nba_train_norm, rep=5, hidden=5, linear.output = F, lifesign = "full", lifesign.step = 100, threshold=3, stepmax=10000)
str(nba_train_norm)
#Predicting the result
nn.results2 <- compute(net.dat2, nba_test_norm[,-12])
plot(net.dat2, rep="best", show.weights = F, col.out = "red", fontsize = 10)
#Confusion matrix of results
results2 <- data.frame(actual = nba_test_norm$FGM, prediction = nn.results2$net.result)
results2$pred <- ifelse(results2[,2] > results2[,3], 0, 1)
table(results2$actual,results2$pred)
```
#Running neural net with nnet function, allowing us to run the Garson method
```{r}
# devtools provides source_gist() below; library() would be preferable to
# require() since require() only warns when the package is missing.
require(devtools)
set.seed(50)
#size of 5, based on our rule of thumb
# NOTE(review): nnet() is not in the p_load() list at the top of the file --
# confirm the nnet package is loaded before this chunk runs.
nbannet <- nnet(nba_train_norm$FGM~., data = nba_train_norm, size = 5, maxit=1000)
#Confusion matrix of results
# Column 12 is dropped -- presumably the FGM outcome; confirm.
estFGM <- predict(nbannet, nba_test_norm[,-12], type="class")
estFGM
table(nba_test_norm$FGM, estFGM)
#Similar results to our neuralnet() one-layer
#import 'gar.fun' from Github
# NOTE(review): this fetches and executes code from the network at run time.
source_gist('6206737')
cols<-colorRampPalette(c('lightgreen','lightblue'))(length(nba_train_norm[,-11]))
par(mar=c(3,4,1,1),family='serif')
#Run garson method
garfunkel <- gar.fun('FGM', nbannet)
garfunkel
#Get table of most salient variables by relative importance
x <- garfunkel$data$x.names
y <- garfunkel$data$rel.imp
# NOTE: `table` shadows base::table within this chunk.
table <- as.table(setNames(y, x))
table <- sort(table, decreasing = T)
relimp <- as.data.frame(table)
relimp$abs <- abs(relimp$Freq)
relimp <- arrange(relimp, abs, decreasing = T)
relimp$Freq <- round(relimp$Freq, 2)
relimp$abs <- round(relimp$abs, 2)
relimp
```
##KNN
```{r knnModel}
library(pacman)
p_load(caret, ggplot2)
# Working copies of the (unnormalised) final splits.
knnTrain <- train_final
knnTest <- test_final
# Dummy-coded columns 7:9 become factors.
for (col in 7:9) {
  knnTrain[[col]] <- as.factor(knnTrain[[col]])
  knnTest[[col]] <- as.factor(knnTest[[col]])
}
# train() requires class levels that are valid R variable names, so the
# 0/1 levels get descriptive labels.
levels(knnTrain$LOCATION_HOME) <- c("A", "H")
levels(knnTrain$FGM) <- c("miss", "made")
levels(knnTrain$PTS_TYPE_2) <- c("three", "two")
levels(knnTest$LOCATION_HOME) <- c("A", "H")
levels(knnTest$FGM) <- c("miss", "made")
levels(knnTest$PTS_TYPE_2) <- c("three", "two")
# 5-fold cross-validation, keeping class probabilities.
ctrl <- trainControl(
  classProbs = TRUE,
  method = "cv", # cross-validation method
  number = 5 # number of folds
)
# Fit k-NN on the training set with k fixed at 200.
fgmknnModel <- train(
  FGM ~ ., data = knnTrain, method = "knn",
  tuneGrid = expand.grid(k = 200), trControl = ctrl
)
fgmknnModel
# Scaled variable importance, plotted.
fgmimpvar <- varImp(fgmknnModel, scale = TRUE)
ggplot(fgmimpvar)
# Final accuracy on the held-out test set.
TestPred <- predict(fgmknnModel, knnTest)
# Confusion matrix of predicted vs actual FGM.
confusionMatrix(TestPred, knnTest$FGM, positive = "made")
```

RAW Paste Data

We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.