Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #To do
- #Added Variable Plots to see if the relationship changes compare to usual bivariate plot
- #VIF and dropping of variables
- #A regression coefficient is not significant even though, theoretically, that variable should be highly correlated with Y.
- #When you add or delete an X variable, the regression coefficients change dramatically.
- #You see a negative regression coefficient when your response should increase along with X.
- #You see a positive regression coefficient when the response should decrease as X increases.
- #Your X variables have high pairwise correlations.
- tennis<-read.csv("data/project2/tennis_2013_2017_GS.csv") %>% mutate(points_diff = log(w_pointswon)-log(l_pointswon), ratio = w_pointswon/(l_pointswon+1)) %>% select(-c(w_pointswon,l_pointswon))
- #Point_diff looks like normally distributed
- #pipeline to remove columns like ID's
- tennis_Cleaned<-tennis %>% select(-c("match_id.x","X","match_id.y","winner_name","loser_name","winner_ioc","a1","a2","player1","player2","slam","week","ratio","l_n_netpt","l_n_netpt_w"))
- #Model with as it is predictors
- model_dumb<- lm(points_diff ~., data = tennis_Cleaned)
- summary(model_dumb)
- #Pipeline to remove null and 0's
- library(corrplot)
- library(tidyverse)
- library(GGally)
- tennis_Unfeatured<- tennis_Cleaned
- #Checking whether or not the response is normally distributed
- ggplot(data = tennis , aes(x = ratio) ) + geom_histogram(aes(y =..density.. ))+ geom_density( alpha=.2, fill="#FF6666")
- #numeric Predictors
- numPredictors<- unlist(map_lgl(tennisPipe,is.numeric))
- View(numPredictors)
- #Thinking about the categorical predictors with too many levels
- #Function to create winning streak ??
- #Mutating new Columns
- tennis_feautures <- tennis_Cleaned %>% mutate(diffAces = (tennis$w_n_aces/(tennis$l_n_aces+1))^2,
- Win_percentServeWon = tennis$w_n_sv_w/tennis$w_n_sv,
- Win_percentAcesPmatch = (tennis$w_n_aces/tennis$w_n_sv)^2,
- diffServSpeed = tennis$w_ave_serve_speed-(tennis$l_ave_serve_speed),
- Win_perBreakWon = tennis$w_n_bp_w/(tennis$w_n_bp+1),
- Win_perNetPoints = tennis$w_n_netpt_w/(tennis$w_n_netpt+1),
- Los_percentServeWon = tennis$l_n_sv_w/tennis$l_n_sv,
- Los_percentAcesPmatch = (tennis$l_n_aces/tennis$l_n_sv)^2,
- Los_perBreakWon = tennis$l_n_bp_w/(tennis$l_n_bp+1),
- Los_perNetPoints = tennis$l_n_netpt_w/tennis$l_n_netpt,
- rankDifferential = winner_rank/loser_rank,
- UE_diff = w_n_ue - l_n_ue,
- CourtType = case_when(
- tournament == "Australian Open" ~ "Hard_Court",
- tournament == "French Open" ~ "Clay",
- tournament == "US Open" ~ "Hard_Court",
- tournament == "Wimbledon" ~ "Grass_Court"
- )
- )
- #French Open = Clay(0), US open, Aus Open= hard courts(1) , Wimbledon = grass court(2)
- model1DF<- tennis_feautures %>% select(c(rankDifferential,winner_age,diffAces,Win_percentServeWon,
- Win_percentAcesPmatch,diffServSpeed,Win_perBreakWon,Win_perNetPoints,
- Los_percentServeWon,Los_percentAcesPmatch,Los_perBreakWon,CourtType,points_diff,Tour,round,UE_diff))
- set.seed(101) # Set Seed so that same sample can be reproduced in future also
- # Now Selecting 70% of data as sample from total 'n' rows of the data
- sample <- sample.int(n = nrow(model1DF), size = floor(.70*nrow(model1DF)), replace = F)
- train <- model1DF[sample, ]
- test <- model1DF[-sample, ]
- str(model1DF)
- #EDA for Model1DF
- library("GGally")
- ggcorr(tennis_Cleaned, palette = "RdYlGn" ,
- label = TRUE, label_color = "black" , label_alpha = TRUE,legend.size = 6,hjust = 0.99, layout.exp = 2)
- ggpairs(model1DF)
- #Distribution of the numeric values
- model1DF %>%
- keep(is.numeric) %>% # Keep only numeric columns
- gather() %>% # Convert to key-value pairs
- ggplot(aes(value)) + # Plot the values
- facet_wrap(~ key, scales = "free") + # In separate panels
- geom_density()
- # Nulls on Model1DF: Becuase of transformations?
- map(model1DF,~sum(is.na(.x)))
- #Transformation to make it look normal necessary=> diffAces,Los_percentAcesPmatch,Win_percentAcesPmatch,rankDifferential
- #CourtType
- #Regression Model
- #We are regressing with the new variables only...
- #Model1 is some variable from data with our variables/features
- #Pointdiff is actually difference of Points
- model_1<-lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch + Win_percentServeWon +
- diffServSpeed + Win_perBreakWon +
- Los_percentServeWon + Los_perBreakWon + CourtType + UE_diff ,data = train)
- write.csv(train,"Project2train.csv")
- summary(model_1)
- #Predictions
- test$pred <- predict(model_1,test)
- ggplot(data = test, mapping = aes(x=test$points_diff,y=pred,color = test$Tour)) + geom_point()
- #ggplot visuals for models
- library(purrr)
- library(broom)
- modelInfo<-model_1 %>% augment(Data = TRUE)
- View(modelInfo)
- #fitted vs std.residuals
- p <- ggplot(data = modelInfo,
- mapping = aes(x = modelInfo$.fitted, y = modelInfo$.std.resid))
- p + geom_point() + geom_hline(yintercept=0, color= "Black", size= 1.2) + geom_hline(yintercept=6.02391, color= "Yellow", size= 1) +
- geom_hline(yintercept=-6.02391, color= "Yellow", size= 1) + theme_minimal()
- #Getting the rows that are probably outliers
- log<-modelInfo$.std.resid>=6.023
- (modelInfo[log,]$.rownames)
- #Cook's distance
- modelInfo$idu <- as.numeric(row.names(modelInfo))
- C<- ggplot(data = modelInfo,mapping = aes(y=modelInfo$.resid, x = modelInfo$.fitted))
- C+geom_point()
- #VIF Model
- vif(model_1)
- #Look at the individual players with original row
- #Inversted U shaped residuals suggests some kind of non linear interaction going on....
- #Estimate Plotting
- View(modelInfo)
- tidyModel1<-model_1 %>% tidy()
- tidyModel1%>%ggplot(aes(x = term, y = estimate)) +
- geom_hline(aes(yintercept = 0), linetype = "dashed") +
- geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error),
- width = .1) +
- geom_point(aes(color = term), size = 3) +
- coord_flip() +
- theme_classic() +
- theme(legend.position = "none")
- #CERES PLOT
- library(car)
- ceresPlots(model_1)
- #QQplot for model_1
- ggplot(modelInfo, aes(sample= modelInfo$.resid, colour = factor(modelInfo$CourtType))) +
- stat_qq() +
- stat_qq_line()
- library(caret)
- #Model Visualization
- ggplot(data = modelInfo,
- mapping = aes(y = modelInfo$.fitted,x = ,color = factor(Tour))) + geom_line() + ggtitle("Linear trend + ")
- ggplot(data = modelInfo,
- mapping = aes(y = .resid,x = points_diff,color = factor(Tour))) + geom_line() + ggtitle("NonLinear trend + ")
- #Cooks Distance Visualization
- max(modelInfo$.cooksd)
- ggplot(data = modelInfo,
- mapping = aes(y = .resid, x = Tour)) + geom_point() + facet_wrap(~CourtType)
- #Bootstrapping
- library(rsample)
- library(purrr)
- library(broom)
- library(ggthemes)
- bootreg = train %>%
- bootstraps(1000) %>%
- pull(splits) %>%
- map_dfr(~lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch + Win_percentServeWon +
- diffServSpeed + Win_perBreakWon +
- Los_percentServeWon + Los_perBreakWon + CourtType ,data = .) %>%
- tidy())
- summarize = dplyr::summarize
- View(bootreg)
- bootreg %>%
- group_by(term) %>%
- summarize(low=quantile(estimate, .025),
- high=quantile(estimate, .975))
- #VIF of Model
- bootstrapInfo<-bootreg %>% group_by(term) %>% nest()
- hist_estimate<-function(tb){
- ggplot(data = tb, mapping = aes(estimate)) + geom_histogram()
- }
- #preferred
- bootstrapInfo<- bootstrapInfo %>% mutate(plot = map2(data,term,~ggplot(data = .x) + theme_minimal() + geom_histogram(aes(estimate))+ggtitle(.y)))
- bootstrapInfo$plot
- #Make a non transformed predictor model which are not corelated and then do anova with the feature one
- Anova(model_2,model_1)
- #Making different model for the men and women....as the residual showed some
- fit_model<- function(df){
- model<-lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch + Win_percentServeWon +
- diffServSpeed + Win_perBreakWon +
- Los_percentServeWon + Los_perBreakWon + CourtType ,data = df)
- return(model)
- }
- nested<-tennis_feautures %>% group_by(Tour) %>% nest()
- nested<- nested%>% mutate(model = map(data,~lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch + Win_percentServeWon + diffServSpeed + Win_perBreakWon +
- Los_percentServeWon + Los_perBreakWon + CourtType ,data = .)))
- get_vif <- function(mod) vif(mod)
- View(nested)
- VIF<-nested%>% mutate(vif = map_df(model,get_vif))
- get_rsq <- function(mod) glance(mod)$r.squared
- nested <- nested %>% mutate(r.squared = map_dbl(model, get_rsq))
- View(nested)
- nested<- nested %>% mutate(tidy_model = map(model,broom::tidy), augment_model = map(model,broom::augment))
- nested
- #Visualizing side by side
- table<- nested %>% unnest(tidy_model,.drop = T)
- augmentTable<- nested %>% unnest(augment_model)
- View(augmentTable)
- #Visualizing estimates for Two different model based on Tour
- table%>%
- ggplot(aes(x = term, y = estimate)) +
- geom_hline(aes(yintercept = 0), linetype = "dashed") +
- geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error),
- width = .2) +
- geom_point(aes(color = term), size = 2) +
- coord_flip() +
- facet_wrap(~Tour,scales = "free") +
- theme_classic() +
- theme(legend.position = "none")
- #Plotting fitted by residual side by side for two different model
- p <- ggplot(data = augmentTable,
- mapping = aes(x = augmentTable$.fitted, y = augmentTable$.std.resid))
- p + geom_point() + geom_hline(yintercept=0, color= "Black", size= 1.2) + geom_hline(yintercept=6.02391, color= "Yellow", size= 1) +
- geom_hline(yintercept=-6.02391, color= "Yellow", size= 1) + theme_minimal() + facet_wrap(~Tour)
- #Plotting fitted by residual side by side for two different model
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement