---
title: "NBA_ONLY_CLEANING"
author: "Malte Højmark-Bertelsen"
date: "7/16/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(pacman)
# neuralnet, nnet and moments are needed for neuralnet(), nnet() and skewness() below
p_load(tidyverse, corrgram, lubridate, gridExtra, caret, BSDA, rpart, rpart.plot, C50, randomForest, class, pastecs, propagate, utils, clusterGeneration, neuralnet, nnet, moments)
```

```{r}
setwd("/cloud/project")
nba <- read.csv("shot_logs.csv")
nba_raw <- nba
```

```{r}
# Justifying that SHOT_RESULT and FGM encode the same data
ls.str(nba)
nba$SHOT_RESULT <- nba$SHOT_RESULT == "made"
nba$FGM <- nba$FGM == 1
summary(nba$SHOT_RESULT == nba$FGM)
summary(nba)
# SHOT_RESULT is equal to FGM, so we can remove one of these variables, as they say exactly the same thing:
nba <- dplyr::select(nba, -SHOT_RESULT)

## MATCHUP handling
# Taking the date out of MATCHUP and creating a DATE column
nba$DATE <- gsub("-.*", "", nba$MATCHUP)
nba$DATE <- as.Date(nba$DATE, format = "%b %d, %Y")

# Taking the teams and making shooting/defending team columns
nba$ONLYTEAMS <- gsub(".*-", "", nba$MATCHUP)
nba$SHOOTING_TEAM <- str_extract(nba$ONLYTEAMS, regex("\\w+"))
nba$DEFENDING_TEAM <- str_extract(nba$ONLYTEAMS, regex("[.|@] \\w+"))
nba$DEFENDING_TEAM <- gsub("[.|@] ", "", nba$DEFENDING_TEAM)
nba <- dplyr::select(nba, -ONLYTEAMS)

# Changing FGM into 1s and 0s
nba$FGM <- ifelse(nba$FGM == T, 1, 0)

# Changing GAME_CLOCK and SHOT_CLOCK into minutes
nba$GAME_CLOCK <- period_to_seconds(ms(nba$GAME_CLOCK)) / 60
nba$SHOT_CLOCK <- nba$SHOT_CLOCK / 60

# Creating a TOTAL_GAMETIME variable in minutes (12-minute periods; GAME_CLOCK counts down within a period)
nba$TOTAL_GAMETIME <- ifelse(
  as.numeric(as.character(nba$PERIOD)) == 1,
  12 - nba$GAME_CLOCK,
  ((as.numeric(as.character(nba$PERIOD)) - 1) * 12) + (12 - nba$GAME_CLOCK)
  )

# Changing shooting player name format into lastname, firstname
#nba$player_name <- as.person(nba$player_name)
#nba$player_name <- format(nba$player_name, include=c("family","given"), braces=list(family=c("",",")))
```
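
A quick way to sanity check the MATCHUP parsing above is to run the same calls on a sample string. This is an illustrative sketch; it assumes MATCHUP values look like "MAR 04, 2015 - CHA @ BKN" (away) or "MAR 04, 2015 - CHA vs. BKN" (home).
```{r}
example <- c("MAR 04, 2015 - CHA @ BKN", "MAR 04, 2015 - CHA vs. BKN")
as.Date(gsub("-.*", "", example), format = "%b %d, %Y")     # the date part
teams <- gsub(".*-", "", example)                            # " CHA @ BKN", " CHA vs. BKN"
str_extract(teams, regex("\\w+"))                            # shooting team: "CHA" "CHA"
gsub("[.|@] ", "", str_extract(teams, regex("[.|@] \\w+"))) # defending team: "BKN" "BKN"
```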

```{r}
# Include in outliers section. To check whether these observations show trends or patterns in other variables, we make a new data frame containing them
nba_touchtime_outliers <- nba[which(nba$TOUCH_TIME <= 0),]

# First we look at game id; from the plots they look more or less the same
plot(nba_touchtime_outliers$GAME_ID)
plot(nba$GAME_ID)

# Next we look at matchup, which looks the same
plot(nba_touchtime_outliers$MATCHUP)
plot(nba$MATCHUP)

# Next we look at location, which looks the same
plot(nba_touchtime_outliers$LOCATION)
plot(nba$LOCATION)

# Next we look at win or lose. There is a small difference: the outliers have a slightly higher proportion of losses, while the original data has slightly more wins. As both are very close to 50%, the difference will not have a big impact on the original data
plot(nba_touchtime_outliers$W)
plot(nba$W)

# Next we look at final margin, which looks more or less the same
hist(nba_touchtime_outliers$FINAL_MARGIN)
hist(nba$FINAL_MARGIN)

# Next we look at shot number, which looks the same
hist(nba_touchtime_outliers$SHOT_NUMBER)
hist(nba$SHOT_NUMBER)

# Next we look at period. There is a small difference, but nothing important
plot(nba_touchtime_outliers$PERIOD)
plot(nba$PERIOD)

# Next we look at game clock. This looks more or less the same
hist(nba_touchtime_outliers$GAME_CLOCK)
hist(nba$GAME_CLOCK)

# Next we look at shot clock. This looks weird: the shots with a touch time that is negative or 0 have a much higher proportion of high shot clocks than the original dataset. A possible explanation: if the ball hits the backboard and doesn't go in, the shot clock resets, but if another player jumps up and dunks it while it is in the air, he would have a very low touch time, potentially recorded as 0, while the shot clock has just reset and is therefore high. This is called a putback
par(mfrow=c(1,2))
hist(nba_touchtime_outliers$SHOT_CLOCK, breaks = 20, main = "Erroneous Data", xlab = "Shot Clock", ylab = "Frequency")
hist(nba$SHOT_CLOCK, breaks = 20, main = "Original Data", xlab = "Shot Clock", ylab = "Frequency")
# If it is a putback, the shot clock should have reset and be very high. Since the ball could be in the air for about a second, we look at the data with a shot clock of at least 23 seconds (23/60 = 0.38 minutes) and a touch time of 0, since these are the ones that could potentially be putbacks
length(which(nba_touchtime_outliers$TOUCH_TIME == 0))
length(which(nba_touchtime_outliers$TOUCH_TIME == 0 & nba_touchtime_outliers$SHOT_CLOCK >= 0.38))
# Out of the 3046 observations with a touch time equal to 0, 1269 also had a shot clock of 23 seconds or higher. This goes against our theory to some extent, since the shot clock would have to be high for a putback, but the 1269 could still potentially be putbacks. If it is a putback, it should be a 2-pointer and it should be close to the hoop.
putback <- nba_touchtime_outliers[which(nba_touchtime_outliers$SHOT_CLOCK >= 0.38),]
hist(putback$SHOT_DIST, breaks = 100)
# The biggest proportion of the data has a distance close to 0, so it could be that a player jumped towards the hoop for a putback and the distance was measured from where he jumped, but there is also a spike a bit further out, which goes against our theory
plot(as.factor(putback$PTS_TYPE)) # The biggest proportion of the shots were 2-pointers, which also makes sense if they are putbacks

# Next is dribbles: all of the touch time outliers have 0 dribbles, while the original dataset also peaks at 0 but has a descending trend out to around 20
hist(nba_touchtime_outliers$DRIBBLES, breaks = 30, main = "Erroneous Data", xlab = "Dribbles", ylab = "Frequency")
hist(nba$DRIBBLES, breaks = 30, main = "Original Data", xlab = "Dribbles", ylab = "Frequency")

length(which(nba_touchtime_outliers$DRIBBLES > 0))
hist(nba_touchtime_outliers$SHOT_DIST)
length(which(nba_touchtime_outliers$SHOT_DIST > 7.12))
780/3358*100
nba_touchtime_outliers[which(nba_touchtime_outliers$SHOT_DIST == 20),]
plot(nba_touchtime_outliers$FGM)
length(which(nba_touchtime_outliers$TOUCH_TIME == 0))
length(which(nba_touchtime_outliers$SHOT_DIST <= 7.12))
summary(nba_touchtime_outliers$SHOT_DIST)
length(which(nba_touchtime_outliers$TOUCH_TIME < 0 & nba_touchtime_outliers$DRIBBLES == 0))
length(which(nba_touchtime_outliers$TOUCH_TIME < 0))
nba_touchtime_outliers[which(nba_touchtime_outliers$SHOT_DIST == 44.8),]
hist(nba_touchtime_outliers$CLOSE_DEF_DIST)

hist(putback$DRIBBLES)
summary(putback$SHOT_DIST)
hist(putback$SHOT_DIST, breaks = 100)

alley_oop <- nba_touchtime_outliers[which(nba_touchtime_outliers$DRIBBLES == 0 & nba_touchtime_outliers$SHOT_DIST < 7.12 & nba_touchtime_outliers$SHOT_CLOCK < 0.38),]

# missing_df is created in a later chunk, so this check is deferred:
#length(which(missing_df$SHOT_DIST > 7.12 & missing_df$TOUCH_TIME == 0))

length(which(putback$SHOT_DIST >= 7.12))

# Looking at how big a proportion of the observations with 0 dribbles the outliers make up
length(which(nba_touchtime_outliers$DRIBBLES == 0)) # 3347
length(which(nba$DRIBBLES == 0)) # 63195
3347/63195*100 # Deleting the outliers would remove 5.3% of the observations in the original dataset with 0 dribbles. The large proportion of outliers with 0 dribbles supports that these were immediate shots upon receiving the ball, so they could be putbacks or quick catch-and-shoot attempts, meaning the touch time could have been recorded as 0

# Next we look at shot distance: the outliers have a higher proportion close to the hoop, while the original dataset is concentrated around 5 and 25 feet. This again supports the idea of putbacks, which happen close to the hoop
hist(nba_touchtime_outliers$SHOT_DIST, breaks = 100, main = "Erroneous Data", xlab = "Shot Distance", ylab = "Frequency")
hist(nba$SHOT_DIST, breaks = 100, main = "Original Data", xlab = "Shot Distance", ylab = "Frequency")

# Next is PTS_TYPE: the outliers have a higher proportion of 2-point shots relative to 3-point shots compared to the original data, which is what we would expect for putbacks or immediate shots
plot(as.factor(nba_touchtime_outliers$PTS_TYPE), main = "Erroneous Data", xlab="Points Type", ylab="Frequency")
plot(as.factor(nba$PTS_TYPE), main = "Original Data", xlab="Points Type", ylab="Frequency")

# Next is closest defender, which looks roughly the same
plot(nba_touchtime_outliers$CLOSEST_DEFENDER)
plot(nba$CLOSEST_DEFENDER)

# Next is the closest defender distance, which looks roughly the same
hist(nba_touchtime_outliers$CLOSE_DEF_DIST, breaks = 100)
hist(nba$CLOSE_DEF_DIST, breaks = 100)
mean(nba_touchtime_outliers$CLOSE_DEF_DIST) # Mean is 3.29
mean(nba$CLOSE_DEF_DIST) # Mean is 4.12
# So there is a small difference, with the outliers having a closer defender

# Next is field goal made, which looks roughly the same. This is odd, since putbacks are more of a sure thing than regular shots, so we would expect the outliers to have a higher proportion of made field goals, which is not the case
plot(nba_touchtime_outliers$FGM)
plot(nba$FGM)

# Next is points made, which differs slightly in that the outliers have a higher proportion of 0s and 2s than the original dataset
hist(nba_touchtime_outliers$PTS, breaks = 3)
hist(nba$PTS, breaks = 3)

# Next is player ID (player name carries the same information, so we skip it). It shows more or less the same, except that the outliers are concentrated on two specific players
plot(nba_touchtime_outliers$player_id)
plot(nba$player_id)
```
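
The chunk above repeats the same side-by-side comparison for many variables. As a sketch (the helper `compare_hist` is not part of the original analysis), the pattern could be wrapped in a small function:
```{r}
# Hypothetical helper: draw the same histogram for the outlier subset and the full data
compare_hist <- function(var, breaks = 30, label = var) {
  par(mfrow = c(1, 2))
  hist(nba_touchtime_outliers[[var]], breaks = breaks,
       main = "Erroneous Data", xlab = label, ylab = "Frequency")
  hist(nba[[var]], breaks = breaks,
       main = "Original Data", xlab = label, ylab = "Frequency")
}
compare_hist("SHOT_DIST", breaks = 100, label = "Shot Distance")
```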

```{r}
# Identifying missing data
summary(nba) # Shows that there are missing values in the shot clock
# Making an indicator variable for the observations with missing data
nba$missing1 <- ifelse(is.na(nba$SHOT_CLOCK), 1, 0)
# Making a data frame for the missing data
missing_df <- nba[nba$missing1==1,]
5567/128069*100 # 4.35% of all the data is missing
summary(missing_df)
```

```{r}
### Looking into whether there are any patterns or trends in the missing data, to see if there is any danger in deleting it
```

```{r}
# First we look at game clock
par(mfrow = c(1,2))
hist(missing_df$GAME_CLOCK, breaks = 50) # Histogram of the game clock for the missing data
hist(nba$GAME_CLOCK, breaks = 50) # Histogram of the game clock for the original data
# There seems to be a trend: the missing data has more observations with a low game clock than the original data. To check this, we calculate the percentage of values with a game clock below 1 minute in each data frame
# Percentage of game clocks below 1 in the missing data frame
table(missing_df$GAME_CLOCK <= 1)
3651/5567*100 # 65.58%
# Percentage of game clocks below 1 in the original data frame
table(nba$GAME_CLOCK <= 1)
12625/128069*100 # 9.86%
# The missing data has a much higher percentage of low game clocks than the original data. Deleting the missing data could therefore bias the remaining data in terms of game clock, since many more low game clocks than high ones would be removed.
# This might be explained by the shot clock being switched off: when the game clock is below 24 seconds (0.4 minutes) and the shot clock resets, it won't show, since the game clock effectively becomes the shot clock

# From watching footage (https://www.youtube.com/watch?v=w_Bk1quPHhY) we can see that in the final seconds of a match there is no shot clock, which makes sense, since the game clock becomes the shot clock.
# Checking whether the game clock is always higher than the shot clock
length(which(nba$GAME_CLOCK < nba$SHOT_CLOCK))
# Only 13 observations out of 128069 have a game clock lower than the shot clock, so this essentially never happens, which supports that there is no shot clock when the game clock is low

# Replacing SHOT_CLOCK NAs with the GAME_CLOCK value where it is at or below 0.4 minutes (corresponding to 24 seconds)
idx <- which(is.na(nba$SHOT_CLOCK) & nba$GAME_CLOCK <= 0.4)
nba$SHOT_CLOCK[idx] <- nba$GAME_CLOCK[idx]

# Making a new data frame for the remaining missing data
nba$missing1 <- ifelse(is.na(nba$SHOT_CLOCK), 1, 0)
missing_df <- nba[nba$missing1==1,]
```
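
A quick count of how many NAs are left after this fill-in (the imputation chunk further down reports 1987) can be added right after:
```{r}
sum(is.na(nba$SHOT_CLOCK)) # SHOT_CLOCK NAs remaining after the game-clock fill-in
```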

```{r}
# First we look at GAME_ID
par(mfrow=c(1,2))
plot(missing_df$GAME_ID) # Shows how many missing data points each game id has. Some game ids have very high missing counts, and in a couple of matches almost none of the observations had a shot clock
plot(nba$GAME_ID)

df_GAME_ID <- data.frame(table(missing_df$GAME_ID)) # New data frame with the frequency of missing data per game id
# Summary of the frequency of missing data per game id
summary(df_GAME_ID$Freq)
# To figure out how many game ids have a significantly higher amount of missing data than the rest, we use the IQR, but first we visualize it with a boxplot
boxplot(df_GAME_ID$Freq)
# Now we calculate it
IQR(df_GAME_ID$Freq)
5+1.5*IQR(df_GAME_ID$Freq) # Upper Tukey fence: Q3 (5, from the summary above) + 1.5 x IQR. A game id needs more than 8 missing observations to count as an outlier
# Now we count how many game ids exceed this
length(which(df_GAME_ID$Freq>8))
24/904*100 # 24 game ids have a significantly higher amount of missing data than the rest, so deleting the missing data would hit those games hardest. Alternatively, deleting those 24 games outright would remove 2.65% of the games in the dataset
```

```{r}
# Next we look at location
par(mfrow=c(1,2))
plot(missing_df$LOCATION) # Distribution of A and H for the missing data
plot(nba$LOCATION) # Distribution of A and H for all the data
# To get more detail we look at the counts
table(missing_df$LOCATION)/2013*100 # In the missing data, 48.98% of the shots were away and 51.02% were home
table(nba$LOCATION)/128069*100 # In all the data, 50.08% were away and 49.92% were home
# All in all the missing data seems random with respect to location, having roughly the same away/home proportions as the original dataset
```

```{r}
# Next we look at win or lose
par(mfrow=c(1,2))
plot(missing_df$W) # Distribution of W and L for the missing data
plot(nba$W) # Distribution of W and L for the original data
# These seem very similar, but to check further we look at the percentages
table(missing_df$W)/2013*100 # 49.58% were L and 50.42% were W
table(nba$W)/128069*100 # 49.56% were L and 50.44% were W
# All in all the missing data and the original data have the same proportions of wins and losses
```

```{r}
# Next we check final margin
hist(missing_df$FINAL_MARGIN) # Histogram of the final margin in the missing data
hist(nba$FINAL_MARGIN) # Histogram of the final margin in the original dataset
# They show small differences, but to check further we use the mean and standard deviation
# First for the missing data
mean(missing_df$FINAL_MARGIN) # -0.028
sd(missing_df$FINAL_MARGIN) # 10.34
# Now for the original data
mean(nba$FINAL_MARGIN) # 0.209
sd(nba$FINAL_MARGIN) # 13.233
# All in all the histograms appear mirrored, but the mean and standard deviation are more or less the same, so the missing data seems to be random for this variable
```

```{r}
# Next we check shot number
hist(missing_df$SHOT_NUMBER) # Histogram of the shot number for the missing data
hist(nba$SHOT_NUMBER) # Histogram of the shot number for the original data
# The histograms look more or less the same, but we can check further with measures of center and spread
# First the missing data
mean(missing_df$SHOT_NUMBER) # 4.95
sd(missing_df$SHOT_NUMBER) # 3.70
# Then the original data
mean(nba$SHOT_NUMBER) # 6.51
sd(nba$SHOT_NUMBER) # 4.71
# All in all the missing data also seems randomly drawn from the original data, though its mean and spread are a bit smaller
```

```{r}
# Next we look at period
plot(missing_df$PERIOD, main="Missing Data", xlab="Period", ylab="Frequency") # Plot of period for the missing data
plot(nba$PERIOD, main="Original data", xlab="Period", ylab="Frequency") # Plot of period for the original data
# These seem a bit different: the missing data has a higher proportion in periods 1 and 2, while the original data is more or less even across the first four periods
```

```{r}
# Since SHOT_CLOCK is the variable with the missing values we won't check that, but go to dribbles instead
hist(missing_df$DRIBBLES, breaks = 30) # Histogram of dribbles for the missing data
hist(nba$DRIBBLES, breaks = 30) # Histogram of dribbles for the original data
# It seems more or less the same, but to check further we look at the percentage distributions
barplot(table(missing_df$DRIBBLES)/2013*100) # Barplot of the dribble percentages for the missing data
barplot(table(nba$DRIBBLES)/128069*100) # Barplot of the dribble percentages for the original data
# The missing data has a higher percentage of 0 dribbles than the original dataset
table(missing_df$DRIBBLES)/2013*100
table(nba$DRIBBLES)/128069*100
# From the tables we see it is only about a 3 percentage point difference, and since the plots look the same there is nothing systematic here
```

```{r}
# Next we look at touch time. This one looks weird because of the huge outliers with negative touch time, which shouldn't be possible
par(mfrow=c(1,2))
hist(missing_df$TOUCH_TIME, main = "Missing Data", xlab = "Touch Time", ylab = "Frequency")
hist(nba$TOUCH_TIME, main = "Original Data", xlab = "Touch Time", ylab = "Frequency")
# To look further into it we calculate the mean and spread
mean(missing_df$TOUCH_TIME) # 2.42
sd(missing_df$TOUCH_TIME) # 4.53
mean(nba$TOUCH_TIME) # 2.77
sd(nba$TOUCH_TIME) # 3.04
# Both histograms look odd, and there seem to be values that are clearly erroneous
range(nba$TOUCH_TIME) # Touch time ranges from -163.6 to 24.9. This is strange: touch time should not be 0 or below, and it also should not exceed the shot clock, which in the NBA is 24 seconds
# Counting the observations where the touch time is less than or equal to 0
length(which(nba$TOUCH_TIME <= 0)) # There are 3358 observations with a touch time of 0 or less
```

```{r}
# Next we look at shot distance, which looks the same
hist(missing_df$SHOT_DIST)
hist(nba$SHOT_DIST)
# To look further into it we calculate the mean and standard deviation
mean(missing_df$SHOT_DIST) # 13.69
sd(missing_df$SHOT_DIST) # 8.85
mean(nba$SHOT_DIST) # 13.57
sd(nba$SHOT_DIST) # 8.89
boxplot(missing_df$SHOT_DIST)
boxplot(nba$SHOT_DIST)
# All in all there seems to be no pattern or trend here
```

```{r}
# Next we look at points type
plot(as.factor(missing_df$PTS_TYPE))
plot(as.factor(nba$PTS_TYPE))
# There seems to be no difference between the original and missing data, which indicates no pattern
# To look further into this we compare the percentages of 2s and 3s
table(missing_df$PTS_TYPE)/2013*100 # 73.82% are 2s and 26.18% are 3s
table(nba$PTS_TYPE)/128069*100 # 73.53% are 2s and 26.47% are 3s
# All in all there is no difference between the missing data and the original data for this variable
```

```{r}
# Next we look at closest defender; since defender name and defender ID carry the same information, we won't look at defender ID. (TODO: find a better way to visualize this)
plot(missing_df$CLOSEST_DEFENDER)
plot(nba$CLOSEST_DEFENDER)
# Some defending players are present much more in the missing data than others, but the counts are nowhere near those in the original data, so defending players most likely wouldn't lose too much of their data if the missing data were deleted
```

```{r}
# Next we look at closest defender distance, which looks the same in the missing data and the original data
hist(missing_df$CLOSE_DEF_DIST, breaks = 40)
hist(nba$CLOSE_DEF_DIST, breaks = 40)
```

```{r}
# Next is field goal made, where there is a small difference in the proportions of 0 and 1 between the missing data and the original data
plot(missing_df$FGM)
plot(nba$FGM)
# To look further into this we look at the percentages
table(missing_df$FGM)/2013*100 # 55.74% of the missing data were misses and 44.26% were makes
table(nba$FGM)/128069*100 # 54.79% of the original data were misses and 45.21% were makes
# So the difference is negligible and no pattern exists here either
```

```{r}
# Next we look at points
hist(missing_df$PTS, breaks = 3)
hist(nba$PTS, breaks = 3)
# This looks very much the same, but to check further we look at the mean and standard deviation
mean(missing_df$PTS) # 0.98
sd(missing_df$PTS) # 1.13
mean(nba$PTS) # 0.997
sd(nba$PTS) # 1.13
# This shows that there is no trend or pattern here
```

```{r}
# Next we look at player, by ID
plot(missing_df$player_id)
plot(nba$player_id)
# Some players are more represented in the missing data than others, but this is also the case in the original data, and the missing counts are nowhere near those in the original data, so most likely no player would be affected significantly if the data were removed
```

```{r}
hist(year(missing_df$DATE), breaks = 2)
hist(year(nba$DATE), breaks = 2)
# The missing data has a higher proportion of observations from 2015, while the original data has more from 2014. Deleting the missing data would increase this imbalance in the original data, but it is still only 3358 observations out of 128069
```

```{r}
# Next we look at shooting team
par(mfrow=c(2,2))
plot(missing_df$SHOOTING_TEAM, main= "Missing Data", xlab= "Shooting Team", ylab= "Frequency")
plot(nba$SHOOTING_TEAM, main= "Original Data", xlab= "Shooting Team", ylab = "Frequency")
# The original data is more or less even across shooting teams, but in the missing data one team has much more missing data
table(missing_df$SHOOTING_TEAM)
# It is the LAL team that has a lot of missing data (493).
length(which(missing_df$SHOOTING_TEAM == "LAL" & missing_df$LOCATION == "H"))
323/493*100 # 65.5% of the missing data for LAL as the shooting team is from home games
```

```{r}
# Next we look at defending team
plot(missing_df$DEFENDING_TEAM, main = "Missing Data", xlab="Defending Team", ylab="Frequency")
plot(nba$DEFENDING_TEAM, main = "Original Data", xlab = "Defending Team", ylab = "Frequency")
# Again the original data has more or less the same distribution, while in the missing data one team is missing much more
table(missing_df$DEFENDING_TEAM) # LAL appears 464 times
# Again LAL has the highest amount of missing data when defending. The teams with high or low amounts of missing data as the shooting team show the same pattern as the defending team. This could mean that their cameras are bad or the tracking system is faulty.
length(which(missing_df$DEFENDING_TEAM == "LAL" & missing_df$LOCATION == "A"))
312/464*100 # 67.2% of the missing data for LAL as the defending team is from their home games (the shooting team is away)

# The percentage of LAL's missing data that came from them being at home
(323+312)/(493+464)*100 # 66.35%

# Making a data frame with all the missing observations involving LAL
nba_LAL_DEF <- missing_df[missing_df$DEFENDING_TEAM == "LAL",]
nba_LAL_SHO <- missing_df[missing_df$SHOOTING_TEAM == "LAL",]
nba_LAL <- rbind(nba_LAL_DEF, nba_LAL_SHO)

# Checking whether there is a pattern for LAL
par(mfrow=c(1,2))
plot(nba_LAL$LOCATION)
plot(nba$LOCATION)
plot(nba_LAL$W)
plot(nba$W)
hist(nba_LAL$FINAL_MARGIN)
hist(nba$FINAL_MARGIN)
hist(nba_LAL$SHOT_NUMBER)
hist(nba$SHOT_NUMBER)
# There is a difference in the periods between LAL and the original data
plot(nba_LAL$PERIOD, main = "Los Angeles Lakers", xlab = "Period", ylab="Frequency")
plot(nba$PERIOD, main = "Original Data", xlab = "Period", ylab = "Frequency")
hist(nba_LAL$GAME_CLOCK)
hist(nba$GAME_CLOCK)
hist(nba_LAL$DRIBBLES)
hist(nba$DRIBBLES)
hist(nba_LAL$TOUCH_TIME)
hist(nba$TOUCH_TIME, breaks = 100)
hist(nba_LAL$SHOT_DIST)
hist(nba$SHOT_DIST)
plot(nba_LAL$PTS_TYPE)
plot(nba$PTS_TYPE)
hist(nba_LAL$CLOSE_DEF_DIST)
hist(nba$CLOSE_DEF_DIST, breaks = 100)
plot(nba_LAL$FGM)
plot(nba$FGM)
hist(nba_LAL$PTS, breaks = 3)
hist(nba$PTS, breaks = 3)
```

```{r}
# Last is game time
hist(missing_df$TOTAL_GAMETIME)
hist(nba$TOTAL_GAMETIME)
# The missing data is concentrated at lower values (0-20), while the original dataset is more evenly distributed from 0 to 40
```

```{r}
# Capping anomalous touch times above 24 seconds at 24
nba$TOUCH_TIME[which(nba$TOUCH_TIME > 24)] <- 24

# Changing negative touch times to 0
nba$TOUCH_TIME[which(nba$TOUCH_TIME < 0)] <- 0

nba_noremoved <- nba

# Because we work under the assumption that a touch time of 0 means putbacks/alley-oops, we delete shots with a shot distance above the mean of that group, a cutoff chosen by watching footage of where players shoot from
summary(nba[which(nba$TOUCH_TIME == 0),]$SHOT_DIST) # mean value = 7.12
nba <- nba[-which(nba$TOUCH_TIME == 0 & nba$SHOT_DIST > 7.12),]

# Under the same assumption we also remove any zero-touch-time shots with more than 0 dribbles
nba <- nba[-which(nba$TOUCH_TIME == 0 & nba$DRIBBLES > 0),]

# Changing variables into the correct classes
nba$GAME_CLOCK <- as.numeric(nba$GAME_CLOCK)
nba$player_id <- as.factor(nba$player_id)
nba$CLOSEST_DEFENDER_PLAYER_ID <- as.factor(nba$CLOSEST_DEFENDER_PLAYER_ID)
nba$FGM <- as.factor(nba$FGM)
nba$GAME_ID <- as.factor(nba$GAME_ID)
nba$DEFENDING_TEAM <- as.factor(nba$DEFENDING_TEAM)
nba$SHOOTING_TEAM <- as.factor(nba$SHOOTING_TEAM)
nba$PTS_TYPE <- as.factor(nba$PTS_TYPE)
nba$PERIOD <- as.factor(nba$PERIOD)
ls.str(nba)
summary(nba)
```

# Newer imputation code

```{r}
nba_keep <- dplyr::select(nba, LOCATION, W, FINAL_MARGIN, SHOT_NUMBER, PERIOD, SHOT_CLOCK, TOUCH_TIME, SHOT_DIST, PTS_TYPE, CLOSE_DEF_DIST, TOTAL_GAMETIME, FGM)
str(nba_keep)

nba_keep$PERIOD <- as.numeric(as.character(nba_keep$PERIOD))
nba_keep$SHOT_NUMBER <- as.numeric(nba_keep$SHOT_NUMBER)

ls.str(nba_keep)

# Turn LOCATION, W, PTS_TYPE into binary variables
nba_1hot <- dplyr::select(nba, LOCATION, W, PTS_TYPE)
nba_1hot$LOCATION <- ifelse(nba_1hot$LOCATION == "H", 1, 0)
colnames(nba_1hot)[1] <- "LOCATION_HOME"
nba_1hot$W <- ifelse(nba_1hot$W == "W", 1, 0)
colnames(nba_1hot)[2] <- "WIN"
nba_1hot$PTS_TYPE <- ifelse(nba_1hot$PTS_TYPE == "2", 1, 0)
colnames(nba_1hot)[3] <- "PTS_TYPE_2"

# Bind the dummy variables to the rest of the columns
nba_keep <- cbind(nba_keep, nba_1hot)
# Remove the columns we have binarized
nba_keep <- dplyr::select(nba_keep, -c(W, LOCATION, PTS_TYPE))
# Place FGM as the last column
nba_keep <- nba_keep %>% dplyr::select(-FGM, FGM)

# Predicting shot clock from the other variables
lm1 <- lm(SHOT_CLOCK~., data=nba_keep)
summary(lm1)
# Step function to find the best regression
step1 <- step(lm1, direction="both")
plot(step1$residuals)

# Finding NAs in the shot clock of the original data. 1987 NAs found
NAshotclock <- which(is.na(nba_keep$SHOT_CLOCK))
# Imputing the missing values
nba_keep[NAshotclock,]$SHOT_CLOCK <- predict(step1, newdata=nba_keep[NAshotclock,])

# Check the difference in histograms between imputed and non-imputed values
par(mfrow=c(1,3))
hist(nba_keep[NAshotclock,]$SHOT_CLOCK, main = "Imputed", xlab = "SHOT CLOCK")
hist(nba_keep[-NAshotclock,]$SHOT_CLOCK, main = "Original", xlab = "SHOT CLOCK")
hist(nba_keep$SHOT_CLOCK, main="Imputed + Original", xlab = "SHOT CLOCK")

nba_keep <- dplyr::select(nba_keep, -c(WIN, FINAL_MARGIN))

# Checking the correlations between our remaining variables
corrgram(nba_keep[, -c(8:10)], main="Correlogram of the remaining NBA variables",
         lower.panel=panel.pts, upper.panel=panel.cor,
         diag.panel=panel.density)

# Removing PERIOD, since it is highly correlated
nba_keep <- dplyr::select(nba_keep, -PERIOD)
```
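
Besides the histograms, a quick numeric side-by-side of the imputed and observed values (not part of the original, but it uses only objects already defined above) confirms the imputed shot clocks stay in a plausible range:
```{r}
summary(nba_keep$SHOT_CLOCK[NAshotclock])  # imputed values (minutes)
summary(nba_keep$SHOT_CLOCK[-NAshotclock]) # observed values (minutes)
```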

```{r}
set.seed(60)
train.index <- createDataPartition(nba_keep$FGM, p = .75, list = FALSE)
train <- nba_keep[ train.index,]
test  <- nba_keep[-train.index,]
summary(nba_keep)
# Sanity check that the split is balanced; FINAL_MARGIN was dropped from nba_keep above, so we test SHOT_DIST instead
t.test(train$SHOT_DIST, test$SHOT_DIST)
```

```{r}
traindec <- train
testdec <- test
# Outliers
traindec.iqr <- traindec

# Create a variable to store the row ids to be removed
Outliers <- c()

# Loop through the numeric columns
for(i in c(1:8)){

  # Get the Tukey fences
  max <- quantile(traindec.iqr[,i],0.75, na.rm=TRUE) + (IQR(traindec.iqr[,i], na.rm=TRUE) * 1.5 )
  min <- quantile(traindec.iqr[,i],0.25, na.rm=TRUE) - (IQR(traindec.iqr[,i], na.rm=TRUE) * 1.5 )

  # Get the ids using which
  idx <- which(traindec.iqr[,i] < min | traindec.iqr[,i] > max)

  # Output the number of outliers in each variable
  print(paste(i, length(idx), sep=': '))

  # Append the outliers list
  Outliers <- c(Outliers, idx)
}

# Sort, I think it's always good to do this
Outliers <- sort(Outliers)

# Remove the outliers
traindec.iqr <- traindec.iqr[-Outliers,]

# testdec outlier removal
testdec.iqr <- testdec

# Create a variable to store the row ids to be removed
Outliers <- c()

# Loop through the numeric columns
for(i in c(1:8)){

  # Get the Tukey fences
  max <- quantile(testdec.iqr[,i],0.75, na.rm=TRUE) + (IQR(testdec.iqr[,i], na.rm=TRUE) * 1.5 )
  min <- quantile(testdec.iqr[,i],0.25, na.rm=TRUE) - (IQR(testdec.iqr[,i], na.rm=TRUE) * 1.5 )

  # Get the ids using which
  idx <- which(testdec.iqr[,i] < min | testdec.iqr[,i] > max)

  # Output the number of outliers in each variable
  print(paste(i, length(idx), sep=': '))

  # Append the outliers list
  Outliers <- c(Outliers, idx)
}

# Sort, I think it's always good to do this
Outliers <- sort(Outliers)

# Remove the outliers
testdec.iqr <- testdec.iqr[-Outliers,]
boxplot(traindec.iqr)

# Unnormalized train and test sets
train_final <- traindec.iqr
test_final <- testdec.iqr
```
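
The fence-and-filter loop above runs twice with identical logic; as a sketch (the function `remove_iqr_outliers` is not part of the original), it could be factored out so train and test are guaranteed to be treated the same way:
```{r}
# Hypothetical refactor of the Tukey-fence removal above
remove_iqr_outliers <- function(df, cols) {
  out <- c()
  for (i in cols) {
    hi <- quantile(df[, i], 0.75, na.rm = TRUE) + 1.5 * IQR(df[, i], na.rm = TRUE)
    lo <- quantile(df[, i], 0.25, na.rm = TRUE) - 1.5 * IQR(df[, i], na.rm = TRUE)
    out <- c(out, which(df[, i] < lo | df[, i] > hi))
  }
  if (length(out) > 0) df <- df[-unique(out), ]
  df
}
# e.g. traindec.iqr <- remove_iqr_outliers(traindec, 1:8)
```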

## transformation
```{r}
train <- train_final
test <- test_final

# Making the dummy variables and FGM factors (the last three columns)
for (i in c(7:9)){
  train[,i] <- as.factor(train[,i])
  test[,i] <- as.factor(test[,i])
}

# Making a combined data set so we can check
normality <- bind_rows(train, test)
summary(normality)

# Checking the different variables for normality
str(normality) # the numeric variables (columns 1 to 6) need to be checked for normality

# FINAL_MARGIN and PERIOD were removed from nba_keep above, so these checks no longer run:
#hist(normality$FINAL_MARGIN) # looks normally distributed
#qqnorm(normality$FINAL_MARGIN) # seems normally distributed, but with some deviation in the tails
#skewness(normality$FINAL_MARGIN) # skewness is 0.012868
#hist(normality$PERIOD, breaks = 5)
#qqnorm(normality$PERIOD)
#skewness(normality$PERIOD)

# Shot number
hist(normality$SHOT_NUMBER, breaks = 5) # looks right skewed
qqnorm(normality$SHOT_NUMBER) # looks odd since it is a discrete variable
skewness(normality$SHOT_NUMBER) # skewness of 0.119
skewness(log(normality$SHOT_NUMBER)) # natural log makes the skewness -0.39
skewness(sqrt(normality$SHOT_NUMBER)) # square root makes the skewness -0.14566
skewness(1/sqrt(normality$SHOT_NUMBER)) # inverse square root makes the skewness 0.5973

# Shot clock
range(normality$SHOT_CLOCK)
normality[which(normality$SHOT_CLOCK == 0),]$SHOT_CLOCK <- 0.001
hist(normality$SHOT_CLOCK) # looks normally distributed with a small bump at high shot clocks
qqnorm(normality$SHOT_CLOCK) # looks normal, but with deviations at the high and low ends
skewness(normality$SHOT_CLOCK) # skewness of 0.0619
skewness(log(normality$SHOT_CLOCK)) # natural log makes the skewness -2.496
skewness(sqrt(normality$SHOT_CLOCK)) # square root transform
skewness(1/sqrt(normality$SHOT_CLOCK)) # inverse square root transform

# Touch time
normality[which(normality$TOUCH_TIME == 0),]$TOUCH_TIME <- 0.001
hist(normality$TOUCH_TIME) # looks right skewed
qqnorm(normality$TOUCH_TIME) # doesn't look normally distributed
skewness(normality$TOUCH_TIME) # skewness of 1.1555
skewness(log(normality$TOUCH_TIME)) # natural log makes the skewness -3.236
skewness(sqrt(normality$TOUCH_TIME)) # square root makes the skewness 0.4296
skewness(1/sqrt(normality$TOUCH_TIME)) # inverse square root makes the skewness 6.593

# Shot distance
normality[which(normality$SHOT_DIST == 0),]$SHOT_DIST <- 0.001
hist(normality$SHOT_DIST) # doesn't look normally distributed; more like the inverse of a normal distribution, with high values at each end
qqnorm(normality$SHOT_DIST) # looks somewhat normal with deviations in the middle of the plot and at the low end
skewness(normality$SHOT_DIST) # skewness of 0.12868
skewness(log(normality$SHOT_DIST)) # natural log makes the skewness -0.85977
skewness(sqrt(normality$SHOT_DIST)) # square root makes the skewness -0.212
skewness(1/sqrt(normality$SHOT_DIST)) # inverse square root makes the skewness 37.2369

# Closest defender distance
normality[which(normality$CLOSE_DEF_DIST == 0),]$CLOSE_DEF_DIST <- 0.001
hist(normality$CLOSE_DEF_DIST, breaks = 100) # has a small right skew
qqnorm(normality$CLOSE_DEF_DIST) # looks more or less normal, but with deviations at the ends
skewness(normality$CLOSE_DEF_DIST) # skewness of 0.509
skewness(log(normality$CLOSE_DEF_DIST)) # natural log makes the skewness -4.66
skewness(sqrt(normality$CLOSE_DEF_DIST)) # square root makes the skewness -0.3789
skewness(1/sqrt(normality$CLOSE_DEF_DIST)) # inverse square root makes the skewness 10.5069

# Total game time
hist(normality$TOTAL_GAMETIME, breaks = 100) # doesn't look normally distributed; more or less even, with more values close to 0
qqnorm(normality$TOTAL_GAMETIME) # doesn't look normal, with gaps between values indicating outliers
skewness(normality$TOTAL_GAMETIME) # skewness of 0.146
skewness(log(normality$TOTAL_GAMETIME)) # natural log makes the skewness -1.5544
skewness(sqrt(normality$TOTAL_GAMETIME)) # square root makes the skewness -0.467
skewness(1/sqrt(normality$TOTAL_GAMETIME)) # inverse square root makes the skewness 4.39
```
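
If one wanted to automate picking the transform with the smallest absolute skewness per variable, a small helper along these lines could do it (a sketch; `best_transform` is hypothetical and not part of the original):
```{r}
# Hypothetical helper: rank candidate transforms of a positive variable by |skewness|
best_transform <- function(x) {
  x <- x[x > 0] # log and inverse square root require positive values
  cands <- list(identity = x, log = log(x), sqrt = sqrt(x), inv_sqrt = 1/sqrt(x))
  sk <- sapply(cands, skewness)
  sk[order(abs(sk))]
}
best_transform(normality$TOUCH_TIME)
```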

## Decision trees

Factor transforming
```{r}
traindec.iqr_noZ <- traindec.iqr
testdec.iqr_noZ <- testdec.iqr
# Setting the dummy variables and FGM to factors
for (i in c(7:9)){
  traindec.iqr[,i] <- as.factor(traindec.iqr[,i])
  testdec.iqr[,i] <- as.factor(testdec.iqr[,i])
}
str(traindec.iqr)
```

```{r}
# z-score transform train set
for (i in c(1:6)){
  traindec.iqr[,i] <- (traindec.iqr[,i] - mean(traindec.iqr[,i]))/sd(traindec.iqr[,i])
}
summary(traindec.iqr)

# z-score transform test set
for (i in c(1:6)){
  testdec.iqr[,i] <- (testdec.iqr[,i] - mean(testdec.iqr[,i]))/sd(testdec.iqr[,i])
}
```
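
Note that the test set above is standardized with its own means and SDs. To keep test-set statistics out of the preprocessing, the second loop could instead reuse the training parameters kept in `traindec.iqr_noZ` (a sketch of an alternative, not what the original does):
```{r}
# Alternative (sketch), to be run in place of the second loop above:
# standardize the test set with the training set's pre-transform parameters
for (i in c(1:6)){
  testdec.iqr[,i] <- (testdec.iqr[,i] - mean(traindec.iqr_noZ[,i]))/sd(traindec.iqr_noZ[,i])
}
```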

Decision trees
```{r}
# rpart decision trees
rpartfit <- train(x=traindec.iqr[,1:8], y=traindec.iqr$FGM, method = "rpart")
rpartfit$bestTune
set.seed(10)
trainrpartcp <- rpart(FGM ~., data = traindec.iqr, method = "class", control = rpart.control(cp = 0.0009221977))
set.seed(10)
trainrpart <- rpart(FGM ~., data = traindec.iqr, method = "class")

summary(trainrpartcp)
rpart.plot(trainrpartcp)
set.seed(10)
estrpartFGM <- predict(trainrpartcp, testdec.iqr, type = "class")
table(testdec.iqr$FGM, estrpartFGM)
(13749+4690)/(length(testdec.iqr$SHOT_NUMBER))*100 # cp = 0.0009221977
(13061+4418)/(length(testdec.iqr$SHOT_NUMBER))*100 # no cp

# C5.0 decision trees
x <- traindec.iqr[,1:8]
y <- as.factor(traindec.iqr$FGM)

set.seed(10)
trainc50 <- C5.0(x,y, control = C5.0Control(minCases = 10))

summary(trainc50)
plot(trainc50)

set.seed(10)
estFGM <- predict(trainc50, testdec.iqr, type = "class")
table(estFGM, testdec.iqr$FGM)
(12766+4515)/(length(testdec.iqr$TOUCH_TIME))*100 # no minCases: 61.64%
(12765+4509)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 5: 61.62%
(12879+4379)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 10: 61.64%
(12807+4458)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 15: 61.59%
(12875+4387)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 20: 61.58%
(12696+4581)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 50: 61.63%
(12928+4345)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 100: 61.61%
(12765+4480)/(length(testdec.iqr$TOUCH_TIME))*100 # minCases 200: 61.51%
length(testdec.iqr$TOUCH_TIME)
```
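
Rather than summing table cells by hand as above, caret's `confusionMatrix()` (caret is already loaded) reports the accuracy directly; a small optional check:
```{r}
# Accuracy straight from caret for the two tree models
confusionMatrix(estrpartFGM, testdec.iqr$FGM)
confusionMatrix(estFGM, testdec.iqr$FGM)
```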

Denormalising salient values
```{r}
# x = z * sd + mean recovers the original scale from a z-score; column indices refer to traindec.iqr_noZ (2 = SHOT_CLOCK, 3 = TOUCH_TIME, 4 = SHOT_DIST, 5 = CLOSE_DEF_DIST)
shotdist <- -0.9279527*sd(traindec.iqr_noZ[,4]) + mean(traindec.iqr_noZ[,4])
shotdist

touch <- -0.3420969*sd(traindec.iqr_noZ[,3]) + mean(traindec.iqr_noZ[,3])
touch

closestdef <- -0.6427635*sd(traindec.iqr_noZ[,5]) + mean(traindec.iqr_noZ[,5])
closestdef

shotclock <- -1.751886*sd(traindec.iqr_noZ[,2]) + mean(traindec.iqr_noZ[,2])
shotclock

shotdist2 <- 0.6792926*sd(traindec.iqr_noZ[,4]) + mean(traindec.iqr_noZ[,4])
shotdist2

shotdist3 <- -0.7593606*sd(traindec.iqr_noZ[,4]) + mean(traindec.iqr_noZ[,4])
shotdist3

closestdef2 <- -0.6427635*sd(traindec.iqr_noZ[,5]) + mean(traindec.iqr_noZ[,5])
closestdef2

shotclock2 <- 1.274499*sd(traindec.iqr_noZ[,2]) + mean(traindec.iqr_noZ[,2])
shotclock2
```
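
Since every line above applies the same inverse z-score formula, a one-line helper (hypothetical, not in the original) avoids repeating the column indices:
```{r}
# Hypothetical helper: invert a z-score using the pre-transform training data
denorm <- function(z, col) z * sd(traindec.iqr_noZ[, col]) + mean(traindec.iqr_noZ[, col])
denorm(-0.9279527, 4) # same value as shotdist above
```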

```{r}
# Neural net on the training set
# Start by min-max normalizing all the numeric columns
nba_train_norm <- train_final
nba_test_norm <- test_final
summary(nba_train_norm)
for (i in c(1:6)){
  nbamin <- min(nba_train_norm[,i])
  nbamax <- max(nba_train_norm[,i])
  nba_train_norm[,i] <- (nba_train_norm[,i] - nbamin)/(nbamax - nbamin)
}

for (i in c(1:6)){
  nbamin <- min(nba_test_norm[,i])
  nbamax <- max(nba_test_norm[,i])
  nba_test_norm[,i] <- (nba_test_norm[,i] - nbamin)/(nbamax - nbamin)
}
```
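
As with the z-scores earlier, the test set here is scaled with its own min/max. A leakage-free variant (a sketch, to be run in place of the second loop above) would reuse the training extremes:
```{r}
# Alternative (sketch): scale the test set with the training set's min/max
for (i in c(1:6)){
  nbamin <- min(train_final[,i])
  nbamax <- max(train_final[,i])
  nba_test_norm[,i] <- (test_final[,i] - nbamin)/(nbamax - nbamin)
}
```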

## Neural network analysis
```{r}
# Final preparation of the data before the neural net analysis
nba_train_norm$FGM <- as.factor(nba_train_norm$FGM)
nba_train_norm$PTS_TYPE_2 <- as.numeric(nba_train_norm$PTS_TYPE_2)
nba_train_norm$LOCATION_HOME <- as.numeric(nba_train_norm$LOCATION_HOME)
str(nba_train_norm)

nba_test_norm$FGM <- as.factor(nba_test_norm$FGM)
nba_test_norm$PTS_TYPE_2 <- as.numeric(nba_test_norm$PTS_TYPE_2)
nba_test_norm$LOCATION_HOME <- as.numeric(nba_test_norm$LOCATION_HOME)
str(nba_test_norm)
```
# Neural net with one and two layers
```{r}
set.seed(40)

# Setting the threshold to 3, since in some iterations there seems to be a local minimum that can't be overcome
# Two-layer ANN
net.dat <- neuralnet(formula = FGM~.,data=nba_train_norm, rep=5, hidden=c(5,5), linear.output = F, lifesign = "full", lifesign.step = 100, threshold=3, stepmax=10000)

# Predicting results (dropping the FGM column, which is the last one, from the test set)
nn.results <- compute(net.dat, nba_test_norm[,-ncol(nba_test_norm)])
# Plotting the network
plot(net.dat, rep="best", show.weights = F, col.out = "red", fontsize = 10)

# Confusion matrix of results
results <- data.frame(actual = nba_test_norm$FGM, prediction = nn.results$net.result)
results$pred <- ifelse(results[,2] > results[,3], 0, 1)
table(results$actual,results$pred)

# One-layer ANN
net.dat2 <- neuralnet(formula = FGM~.,data=nba_train_norm, rep=5, hidden=5, linear.output = F, lifesign = "full", lifesign.step = 100, threshold=3, stepmax=10000)
str(nba_train_norm)
# Predicting the result
nn.results2 <- compute(net.dat2, nba_test_norm[,-ncol(nba_test_norm)])

plot(net.dat2, rep="best", show.weights = F, col.out = "red", fontsize = 10)

# Confusion matrix of results
results2 <- data.frame(actual = nba_test_norm$FGM, prediction = nn.results2$net.result)
results2$pred <- ifelse(results2[,2] > results2[,3], 0, 1)
table(results2$actual,results2$pred)
```
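
For comparability with the decision trees above, the two confusion matrices can be reduced to overall accuracies (a small optional addition):
```{r}
# Overall accuracy of the two-layer and one-layer networks
mean(as.character(results$actual) == as.character(results$pred))
mean(as.character(results2$actual) == as.character(results2$pred))
```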

# Running neural net with the nnet function, allowing us to run the Garson method
```{r}
require(devtools)
set.seed(50)

# Size of 5, based on our rule of thumb
nbannet <- nnet(FGM~., data = nba_train_norm, size = 5, maxit=1000)

# Confusion matrix of results
estFGM <- predict(nbannet, nba_test_norm[,-ncol(nba_test_norm)], type="class")
estFGM
table(nba_test_norm$FGM, estFGM)
# Similar results to our one-layer neuralnet()

# Import 'gar.fun' from GitHub
source_gist('6206737')
cols <- colorRampPalette(c('lightgreen','lightblue'))(ncol(nba_train_norm) - 1)
par(mar=c(3,4,1,1),family='serif')

# Run the Garson method
garfunkel <- gar.fun('FGM', nbannet)
garfunkel

# Get a table of the most salient variables by relative importance
x <- garfunkel$data$x.names
y <- garfunkel$data$rel.imp
table <- as.table(setNames(y, x))
table <- sort(table, decreasing = T)
relimp <- as.data.frame(table)
relimp$abs <- abs(relimp$Freq)
relimp <- arrange(relimp, desc(abs))
relimp$Freq <- round(relimp$Freq, 2)
relimp$abs <- round(relimp$abs, 2)
relimp
```

## KNN
```{r knnModel}
library(pacman)
p_load(caret, ggplot2)

knnTrain = train_final
knnTest = test_final

# Categoricals to factors
for (i in c(7:9)){
  knnTrain[,i] <- as.factor(knnTrain[,i])
  knnTest[,i] <- as.factor(knnTest[,i])
}

# Adding names to the factor levels, due to the train() error "one of the class levels is not a valid R variable name"
levels(knnTrain$LOCATION_HOME) = c("A", "H")
levels(knnTrain$FGM) = c("miss", "made")
levels(knnTrain$PTS_TYPE_2) = c("three", "two")

levels(knnTest$LOCATION_HOME) = c("A", "H")
levels(knnTest$FGM) = c("miss", "made")
levels(knnTest$PTS_TYPE_2) = c("three", "two")

# Setting parameters for cross-validation in the model
ctrl <- trainControl(
  classProbs = TRUE,
  method = "cv", # cross-validation method
  number = 5 # number of folds
  )

# Modeling with the training set
fgmknnModel <- train(FGM~., data = knnTrain, method = "knn", tuneGrid=expand.grid(k = 200), trControl = ctrl)
fgmknnModel

# Determining and plotting the variable importance
fgmimpvar = varImp(fgmknnModel, scale = T)
ggplot(fgmimpvar)

# Predicting shot outcome and establishing the final accuracy using the test set
TestPred <- predict(fgmknnModel, knnTest)

# Creating the confusion matrix using the predicted and test values for FGM
confusionMatrix(TestPred, knnTest$FGM, positive = "made")
```
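
The model above evaluates only k = 200. Letting cross-validation choose k from a small grid (the grid values below are arbitrary examples) would justify that choice:
```{r}
# Sketch: tune k over a grid instead of fixing k = 200
kGrid <- expand.grid(k = c(25, 50, 100, 200, 400))
knnTuned <- train(FGM~., data = knnTrain, method = "knn", tuneGrid = kGrid, trControl = ctrl)
plot(knnTuned) # CV accuracy as a function of k
```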