
#To do
#Added-variable plots to see if the relationships change compared to the usual bivariate plots
#VIF and dropping of variables (see the quick collinearity check after model_dumb below)
  #A regression coefficient is not significant even though, theoretically, that variable should be highly correlated with Y.
  #When you add or delete an X variable, the regression coefficients change dramatically.
  #You see a negative regression coefficient when your response should increase along with X.
  #You see a positive regression coefficient when the response should decrease as X increases.
  #Your X variables have high pairwise correlations.

library(tidyverse)   # needed for %>%, mutate(), select() used below

tennis<-read.csv("data/project2/tennis_2013_2017_GS.csv")  %>% mutate(points_diff = log(w_pointswon)-log(l_pointswon), ratio = w_pointswon/(l_pointswon+1)) %>% select(-c(w_pointswon,l_pointswon))
#points_diff looks approximately normally distributed (quick check below)

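#Minimal sketch of the normality check mentioned above: a base-R histogram and normal
#QQ plot of points_diff (nothing here is assumed beyond the columns created above).
hist(tennis$points_diff, breaks = 50, main = "points_diff", xlab = "points_diff")
qqnorm(tennis$points_diff); qqline(tennis$points_diff)
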
#Pipeline to remove identifier-like columns (IDs, names, etc.)
tennis_Cleaned<-tennis %>% select(-c("match_id.x","X","match_id.y","winner_name","loser_name","winner_ioc","a1","a2","player1","player2","slam","week","ratio","l_n_netpt","l_n_netpt_w"))
#Baseline model with the predictors as they are
model_dumb<- lm(points_diff ~., data = tennis_Cleaned)
summary(model_dumb)
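
#Minimal sketch of the collinearity check from the to-do list above, run against the
#baseline model. The 0.8 threshold is a judgment call, and car::vif() can error while
#aliased (perfectly collinear) terms remain in model_dumb, so it is left commented.
num_cols <- tennis_Cleaned %>% keep(is.numeric)
cors <- cor(num_cols, use = "pairwise.complete.obs")
cors[upper.tri(cors, diag = TRUE)] <- NA
which(abs(cors) > 0.8, arr.ind = TRUE)   # flag predictor pairs with |r| > 0.8
# car::vif(model_dumb)                   # VIF itself, once aliased terms are dropped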
#Pipeline to remove NAs and zero counts (sketch below)

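#Minimal sketch of such a pipeline, assuming rows with missing values or zero serve
#counts should be dropped; kept in a separate object (tennis_noNA, a name introduced
#here) so the row alignment with `tennis` used further below is not disturbed.
tennis_noNA <- tennis_Cleaned %>%
  drop_na() %>%                      # tidyr: drop rows with any NA
  filter(w_n_sv > 0, l_n_sv > 0)     # guard against divide-by-zero in the serve ratios
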
library(corrplot)
library(tidyverse)
library(GGally)
tennis_Unfeatured<- tennis_Cleaned
#Checking whether or not the response is normally distributed
ggplot(data = tennis , aes(x = ratio) ) + geom_histogram(aes(y =..density.. ))+ geom_density( alpha=.2, fill="#FF6666")
#Numeric predictors
numPredictors<- map_lgl(tennis_Cleaned, is.numeric)   # map_lgl() already returns a logical vector, no unlist() needed
View(numPredictors)
#Thinking about the categorical predictors with too many levels (level counts below)

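#Minimal sketch for the point above: count the distinct levels of each non-numeric
#column in tennis_Cleaned to spot high-cardinality categorical predictors.
tennis_Cleaned %>%
  discard(is.numeric) %>%        # purrr: keep only the non-numeric columns
  map_int(n_distinct) %>%
  sort(decreasing = TRUE)
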
#Function to create winning streak ?? (hypothetical sketch below)

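#Hypothetical sketch of a winning-streak helper: given one player's results in
#chronological order (TRUE = win), return the length of the streak coming into each
#match. Joining this back onto `tennis` per player and date is the open to-do above;
#the data layout that requires is an assumption.
streak_before <- function(won) {
  streak <- integer(length(won))
  for (i in seq_along(won)[-1]) {
    streak[i] <- if (won[i - 1]) streak[i - 1] + 1 else 0
  }
  streak
}
streak_before(c(TRUE, TRUE, FALSE, TRUE, TRUE, TRUE))   # 0 1 2 0 1 2
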
#Mutating new columns
#(the tennis$ references below deliberately pull from the raw data, since a few of
# these raw columns, e.g. l_n_netpt, were dropped from tennis_Cleaned)
tennis_features <- tennis_Cleaned %>% mutate(diffAces = (tennis$w_n_aces/(tennis$l_n_aces+1))^2,
                  Win_percentServeWon = tennis$w_n_sv_w/tennis$w_n_sv,
                  Win_percentAcesPmatch = (tennis$w_n_aces/tennis$w_n_sv)^2,
                  diffServSpeed = tennis$w_ave_serve_speed-(tennis$l_ave_serve_speed),
                  Win_perBreakWon = tennis$w_n_bp_w/(tennis$w_n_bp+1),
                  Win_perNetPoints = tennis$w_n_netpt_w/(tennis$w_n_netpt+1),
                  Los_percentServeWon = tennis$l_n_sv_w/tennis$l_n_sv,
                  Los_percentAcesPmatch = (tennis$l_n_aces/tennis$l_n_sv)^2,
                  Los_perBreakWon = tennis$l_n_bp_w/(tennis$l_n_bp+1),
                  Los_perNetPoints = tennis$l_n_netpt_w/tennis$l_n_netpt,
                  rankDifferential = winner_rank/loser_rank,
                  UE_diff = w_n_ue - l_n_ue,
                  CourtType = case_when(
                    tournament == "Australian Open" ~ "Hard_Court",
                    tournament == "French Open" ~ "Clay",
                    tournament == "US Open" ~ "Hard_Court",
                    tournament == "Wimbledon" ~ "Grass_Court"
                  )
                  )

#French Open = clay, US Open and Australian Open = hard court, Wimbledon = grass court
model1DF<- tennis_features %>% select(c(rankDifferential,winner_age,diffAces,Win_percentServeWon,
                                         Win_percentAcesPmatch,diffServSpeed,Win_perBreakWon,Win_perNetPoints,
                                        Los_percentServeWon,Los_percentAcesPmatch,Los_perBreakWon,CourtType,points_diff,Tour,round,UE_diff))
  60.  
  61.  
  62. set.seed(101) # Set Seed so that same sample can be reproduced in future also
  63.  
  64. # Now Selecting 70% of data as sample from total 'n' rows of the data
  65. sample <- sample.int(n = nrow(model1DF), size = floor(.70*nrow(model1DF)), replace = F)
  66. train <- model1DF[sample, ]
  67. test  <- model1DF[-sample, ]
  68. str(model1DF)
#EDA for model1DF
library(GGally)
ggcorr(tennis_Cleaned, palette = "RdYlGn" ,
       label = TRUE, label_color = "black" , label_alpha = TRUE,legend.size = 6,hjust = 0.99, layout.exp = 2)
ggpairs(model1DF)


#Distribution of the numeric values
model1DF %>%
  keep(is.numeric) %>%                   # keep only numeric columns
  gather() %>%                           # convert to key-value pairs
  ggplot(aes(value)) +                   # plot the values
  facet_wrap(~ key, scales = "free") +   # in separate panels
  geom_density()

# NAs in model1DF: because of the transformations (divisions by zero)?

map(model1DF,~sum(is.na(.x)))

#Transformations needed to make these look normal => diffAces, Los_percentAcesPmatch, Win_percentAcesPmatch, rankDifferential (sketch below)
#CourtType

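#Minimal sketch of the transformations flagged above, assuming a log transform is the
#intended fix for these right-skewed ratio features; log1p() is used as an assumption to
#avoid log(0), and model1DF_t and the *_log column names are introduced here for illustration.
model1DF_t <- model1DF %>%
  mutate(diffAces_log = log1p(diffAces),
         Los_percentAcesPmatch_log = log1p(Los_percentAcesPmatch),
         Win_percentAcesPmatch_log = log1p(Win_percentAcesPmatch),
         rankDifferential_log = log1p(rankDifferential))
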

#Regression model
#We regress on the engineered features only...
#model_1 uses our new features plus a few variables carried over from the data
#points_diff is the difference of the logged points won (winner minus loser)
model_1<-lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon +
            diffServSpeed + Win_perBreakWon +
            Los_percentServeWon + Los_perBreakWon + CourtType + UE_diff ,data = train)

write.csv(train,"Project2train.csv")

summary(model_1)
#Predictions
test$pred <- predict(model_1,test)
ggplot(data = test, mapping = aes(x = points_diff, y = pred, color = Tour)) + geom_point()
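#Quick hold-out check of the predictions above: RMSE on the test set (rows made NA by
#the engineered ratios are dropped; nothing else is assumed).
rmse_test <- sqrt(mean((test$points_diff - test$pred)^2, na.rm = TRUE))
rmse_test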
#ggplot visuals for the model
library(purrr)
library(broom)
modelInfo<-model_1 %>% augment()   # `Data = TRUE` is not an augment() argument
#augment() only returns the model-frame columns, so carry Tour over for the plots below
modelInfo$Tour <- train[modelInfo$.rownames, "Tour"]
View(modelInfo)

#Fitted values vs. standardized residuals
p <- ggplot(data = modelInfo,
            mapping = aes(x = .fitted, y = .std.resid))
p + geom_point()  + geom_hline(yintercept=0, color= "Black", size= 1.2) + geom_hline(yintercept=6.02391, color= "Yellow", size= 1) +
  geom_hline(yintercept=-6.02391, color= "Yellow", size= 1) + theme_minimal()
#Getting the rows that are probably outliers
is_outlier<-modelInfo$.std.resid>=6.023
(modelInfo[is_outlier,]$.rownames)

#Residuals vs. fitted values (Cook's distance itself is visualized further below)
modelInfo$idu <- as.numeric(row.names(modelInfo))
C<- ggplot(data = modelInfo,mapping = aes(y=.resid, x = .fitted))
C+geom_point()
#VIF for model_1 (vif() comes from the car package)
car::vif(model_1)
#Look at the individual players behind the outlier rows (sketch below)
#Inverted U-shaped residuals suggest some kind of nonlinear interaction going on....
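#Minimal sketch for the comment above: map the flagged outlier rows back to the original
#data to see which players were involved. This assumes .rownames still indexes rows of
#`tennis` (the training rows were subset, not reordered), which is worth double-checking.
outlier_idx <- as.integer(modelInfo$.rownames[which(modelInfo$.std.resid >= 6.023)])
tennis[outlier_idx, c("winner_name", "loser_name", "tournament")]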
#Coefficient estimate plot
View(modelInfo)
tidyModel1<-model_1 %>% tidy()

tidyModel1 %>% ggplot(aes(x = term, y = estimate)) +
  geom_hline(aes(yintercept = 0), linetype = "dashed") +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error),
                width = .1) +
  geom_point(aes(color = term), size = 3) +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "none")
#CERES plots (partial-residual diagnostics)
library(car)
ceresPlots(model_1)


#QQ plot of the model_1 residuals
ggplot(modelInfo, aes(sample = .resid, colour = factor(CourtType))) +
  stat_qq() +
  stat_qq_line()

library(caret)


#Model visualization
ggplot(data = modelInfo,
       mapping = aes(y = .fitted, x = points_diff, color = factor(Tour))) + geom_line() +  ggtitle("Fitted values vs. points_diff by Tour")
ggplot(data = modelInfo,
       mapping = aes(y = .resid, x = points_diff, color = factor(Tour))) + geom_line() +  ggtitle("Residuals vs. points_diff by Tour")
#Cook's distance visualization (sketch below)
max(modelInfo$.cooksd)
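#Minimal sketch for the Cook's distance visualization above: an index plot with the common
#4/n rule-of-thumb cutoff (the cutoff choice is an assumption, not part of the original analysis).
cooks_cutoff <- 4 / nrow(modelInfo)
ggplot(modelInfo, aes(x = idu, y = .cooksd)) +
  geom_col() +
  geom_hline(yintercept = cooks_cutoff, linetype = "dashed", color = "red") +
  theme_minimal()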

ggplot(data = modelInfo,
       mapping = aes(y = .resid, x = Tour)) + geom_point() + facet_wrap(~CourtType)
#Bootstrapping
library(rsample)
library(purrr)
library(broom)
library(ggthemes)
bootreg = train %>%
  bootstraps(1000) %>%
  pull(splits) %>%
  map_dfr(~lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon +
            diffServSpeed + Win_perBreakWon +
            Los_percentServeWon + Los_perBreakWon + CourtType ,data = analysis(.)) %>%   # analysis() pulls the bootstrap sample out of each rsplit
            tidy())

summarize = dplyr::summarize   # make sure dplyr's summarize is the one used below

View(bootreg)
bootreg %>%
  group_by(term) %>%
  summarize(low=quantile(estimate, .025),
            high=quantile(estimate, .975))

#VIF of Model


bootstrapInfo<-bootreg %>% group_by(term) %>% nest()
hist_estimate<-function(tb){

  ggplot(data = tb, mapping = aes(estimate)) + geom_histogram()
}
#Preferred: one histogram of the bootstrap estimates per coefficient
bootstrapInfo<- bootstrapInfo %>% mutate(plot = map2(data,term,~ggplot(data = .x) + theme_minimal() + geom_histogram(aes(estimate))+ggtitle(.y)))

bootstrapInfo$plot
#Make a model with non-transformed predictors that are not correlated, then compare it to the feature model with an ANOVA (hedged sketch of model_2 below)
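#Hedged sketch of the comparison described above. Which predictors count as
#"non-transformed and uncorrelated" is an assumption here: model_2 keeps the raw
#percentage and difference terms and drops the squared ace ratios, so it is nested
#inside model_1 and a standard F-test applies. If NA handling leaves the two fits with
#different row counts, refit both on a shared complete-case subset first.
model_2 <- lm(points_diff ~ Win_percentServeWon + Los_percentServeWon +
                diffServSpeed + Win_perBreakWon + Los_perBreakWon +
                CourtType + UE_diff, data = train)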
anova(model_2, model_1)   # lowercase anova() compares nested models; car::Anova() tests terms of a single model

#Fitting separate models for the men's and women's tours, as the residuals showed some tour-related structure
fit_model<- function(df){
  model<-lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon +
              diffServSpeed + Win_perBreakWon +
              Los_percentServeWon + Los_perBreakWon + CourtType ,data = df)
  return(model)
}
nested<-tennis_features %>% group_by(Tour) %>% nest()
nested<- nested %>% mutate(model = map(data, fit_model))   # reuse fit_model() instead of repeating the formula
get_vif <- function(mod) vif(mod)
View(nested)
VIF<-nested %>% mutate(vif = map(model, ~ as.data.frame(get_vif(.x))))   # list column; vif() reports GVIFs here because CourtType is a factor
get_rsq <- function(mod) glance(mod)$r.squared
nested <- nested %>% mutate(r.squared = map_dbl(model, get_rsq))

View(nested)
nested<- nested %>% mutate(tidy_model = map(model,broom::tidy), augment_model = map(model,broom::augment))

nested
#Visualizing the two models side by side
coef_table<- nested %>% select(Tour, tidy_model) %>% unnest(tidy_model)       # renamed from `table` to avoid masking base::table()
augmentTable<- nested %>% select(Tour, augment_model) %>% unnest(augment_model)
View(augmentTable)


#Visualizing the estimates of the two Tour-specific models
coef_table %>%
  ggplot(aes(x = term, y = estimate)) +
  geom_hline(aes(yintercept = 0), linetype = "dashed") +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error),
                width = .2) +
  geom_point(aes(color = term), size = 2) +
  coord_flip() +
  facet_wrap(~Tour,scales = "free") +
  theme_classic() +
  theme(legend.position = "none")
#Plotting standardized residuals vs. fitted values side by side for the two models
p <- ggplot(data = augmentTable,
            mapping = aes(x = .fitted, y = .std.resid))
p + geom_point()  + geom_hline(yintercept=0, color= "Black", size= 1.2) + geom_hline(yintercept=6.02391, color= "Yellow", size= 1) +
  geom_hline(yintercept=-6.02391, color= "Yellow", size= 1) + theme_minimal() + facet_wrap(~Tour)