Untitled


#To do
#Added Variable Plots to see if the relationship changes compare to usual bivariate plot
#VIF and dropping of variables
  #A regression coefficient is not significant even though, theoretically, that variable should be highly correlated with Y.
  #When you add or delete an X variable, the regression coefficients change dramatically.
  #You see a negative regression coefficient when your response should increase along with X.
  #You see a positive regression coefficient when the response should decrease as X increases.
  #Your X variables have high pairwise correlations.

tennis<-read.csv("data/project2/tennis_2013_2017_GS.csv")  %>% mutate(points_diff = log(w_pointswon)-log(l_pointswon), ratio = w_pointswon/(l_pointswon+1)) %>% select(-c(w_pointswon,l_pointswon))
#Point_diff looks like normally distributed

#pipeline to remove columns like ID's
tennis_Cleaned<-tennis %>% select(-c("match_id.x","X","match_id.y","winner_name","loser_name","winner_ioc","a1","a2","player1","player2","slam","week","ratio","l_n_netpt","l_n_netpt_w"))
#Model with as it is predictors
model_dumb<- lm(points_diff ~., data = tennis_Cleaned)
summary(model_dumb)
#Pipeline to remove null and 0's

library(corrplot)
library(tidyverse)
library(GGally)
tennis_Unfeatured<- tennis_Cleaned
#Checking whether or not the response is normally distributed
ggplot(data = tennis , aes(x = ratio) ) + geom_histogram(aes(y =..density.. ))+ geom_density( alpha=.2, fill="#FF6666")
#numeric Predictors
numPredictors<- unlist(map_lgl(tennisPipe,is.numeric))
View(numPredictors)
#Thinking about the categorical predictors with too many levels

#Function to create winning streak ??


#Mutating new Columns
tennis_feautures <- tennis_Cleaned %>% mutate(diffAces = (tennis$w_n_aces/(tennis$l_n_aces+1))^2,
                  Win_percentServeWon = tennis$w_n_sv_w/tennis$w_n_sv,
                  Win_percentAcesPmatch = (tennis$w_n_aces/tennis$w_n_sv)^2,
                  diffServSpeed = tennis$w_ave_serve_speed-(tennis$l_ave_serve_speed),
                  Win_perBreakWon = tennis$w_n_bp_w/(tennis$w_n_bp+1),
                  Win_perNetPoints = tennis$w_n_netpt_w/(tennis$w_n_netpt+1),
                  Los_percentServeWon = tennis$l_n_sv_w/tennis$l_n_sv,
                  Los_percentAcesPmatch = (tennis$l_n_aces/tennis$l_n_sv)^2,
                  Los_perBreakWon = tennis$l_n_bp_w/(tennis$l_n_bp+1),
                  Los_perNetPoints = tennis$l_n_netpt_w/tennis$l_n_netpt,
                  rankDifferential = winner_rank/loser_rank,
                  UE_diff = w_n_ue - l_n_ue,
                  CourtType = case_when(
                    tournament == "Australian Open" ~ "Hard_Court",
                    tournament == "French Open" ~ "Clay",
                    tournament == "US Open" ~ "Hard_Court",
                    tournament == "Wimbledon" ~ "Grass_Court"
                  )
                  )

#French Open  = Clay(0), US open, Aus Open= hard courts(1) , Wimbledon = grass court(2)
model1DF<- tennis_feautures %>% select(c(rankDifferential,winner_age,diffAces,Win_percentServeWon,
                                         Win_percentAcesPmatch,diffServSpeed,Win_perBreakWon,Win_perNetPoints,
                                        Los_percentServeWon,Los_percentAcesPmatch,Los_perBreakWon,CourtType,points_diff,Tour,round,UE_diff))


set.seed(101) # Set Seed so that same sample can be reproduced in future also

# Now Selecting 70% of data as sample from total 'n' rows of the data
sample <- sample.int(n = nrow(model1DF), size = floor(.70*nrow(model1DF)), replace = F)
train <- model1DF[sample, ]
test  <- model1DF[-sample, ]
str(model1DF)
#EDA for Model1DF
library("GGally")
ggcorr(tennis_Cleaned, palette = "RdYlGn" ,
       label = TRUE, label_color = "black" , label_alpha = TRUE,legend.size = 6,hjust = 0.99, layout.exp = 2)
ggpairs(model1DF)


#Distribution of the numeric values
model1DF %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
  facet_wrap(~ key, scales = "free") +   # In separate panels
  geom_density()

# Nulls on Model1DF: Becuase of transformations?

map(model1DF,~sum(is.na(.x)))

#Transformation to make it look normal necessary=> diffAces,Los_percentAcesPmatch,Win_percentAcesPmatch,rankDifferential
#CourtType


#Regression  Model
#We are regressing with the new variables only...
#Model1 is some variable from data with our variables/features
#Pointdiff is actually difference of Points
model_1<-lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon +
            diffServSpeed + Win_perBreakWon +
            Los_percentServeWon + Los_perBreakWon + CourtType + UE_diff ,data = train)

write.csv(train,"Project2train.csv")

summary(model_1)
#Predictions
test$pred <- predict(model_1,test)
ggplot(data = test, mapping = aes(x=test$points_diff,y=pred,color = test$Tour)) + geom_point()
#ggplot visuals for models
library(purrr)
library(broom)
modelInfo<-model_1 %>% augment(Data = TRUE)
View(modelInfo)

#fitted vs std.residuals
p <- ggplot(data = modelInfo,
            mapping = aes(x = modelInfo$.fitted, y = modelInfo$.std.resid))
p + geom_point()  + geom_hline(yintercept=0, color= "Black", size= 1.2) + geom_hline(yintercept=6.02391, color= "Yellow", size= 1) +
  geom_hline(yintercept=-6.02391, color= "Yellow", size= 1) + theme_minimal()
#Getting the rows that are probably outliers
log<-modelInfo$.std.resid>=6.023
(modelInfo[log,]$.rownames)

#Cook's distance
modelInfo$idu <- as.numeric(row.names(modelInfo))
C<- ggplot(data = modelInfo,mapping = aes(y=modelInfo$.resid, x = modelInfo$.fitted))
C+geom_point()
 #VIF Model
vif(model_1)
#Look at the individual players with original row
#Inversted U shaped residuals suggests some kind of non linear interaction going on....
#Estimate Plotting
View(modelInfo)
tidyModel1<-model_1 %>% tidy()

tidyModel1%>%ggplot(aes(x = term, y = estimate)) +
  geom_hline(aes(yintercept = 0), linetype = "dashed") +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error),
                width = .1) +
  geom_point(aes(color = term), size = 3) +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "none")
#CERES PLOT
library(car)
ceresPlots(model_1)


#QQplot for model_1
ggplot(modelInfo, aes(sample= modelInfo$.resid, colour = factor(modelInfo$CourtType))) +
  stat_qq() +
  stat_qq_line()

library(caret)


#Model Visualization
ggplot(data = modelInfo,
            mapping = aes(y = modelInfo$.fitted,x = ,color = factor(Tour))) + geom_line() +  ggtitle("Linear trend + ")
ggplot(data = modelInfo,
       mapping = aes(y = .resid,x = points_diff,color = factor(Tour))) + geom_line() +  ggtitle("NonLinear trend + ")
#Cooks Distance Visualization
max(modelInfo$.cooksd)

ggplot(data = modelInfo,
       mapping = aes(y = .resid, x = Tour)) + geom_point() + facet_wrap(~CourtType)
#Bootstrapping
library(rsample)
library(purrr)
library(broom)
library(ggthemes)
bootreg = train %>%
  bootstraps(1000) %>%
  pull(splits) %>%
  map_dfr(~lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon +
            diffServSpeed + Win_perBreakWon +
            Los_percentServeWon + Los_perBreakWon + CourtType ,data = .) %>%
            tidy())

summarize = dplyr::summarize

View(bootreg)
bootreg %>%
  group_by(term) %>%
  summarize(low=quantile(estimate, .025),
            high=quantile(estimate, .975))

#VIF of Model


bootstrapInfo<-bootreg %>% group_by(term) %>% nest()
hist_estimate<-function(tb){

  ggplot(data = tb, mapping = aes(estimate)) + geom_histogram()
}
#preferred
bootstrapInfo<- bootstrapInfo %>% mutate(plot = map2(data,term,~ggplot(data = .x) + theme_minimal() + geom_histogram(aes(estimate))+ggtitle(.y)))

bootstrapInfo$plot
#Make a non transformed predictor model which are not corelated and then do anova with the feature one
Anova(model_2,model_1)

#Making different model for the men and women....as the residual showed some
fit_model<- function(df){
  model<-lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon +
              diffServSpeed + Win_perBreakWon +
              Los_percentServeWon + Los_perBreakWon + CourtType ,data = df)
  return(model)
}
nested<-tennis_feautures %>% group_by(Tour) %>% nest()
nested<- nested%>% mutate(model = map(data,~lm(points_diff~ diffAces+Los_percentAcesPmatch + Win_percentAcesPmatch  + Win_percentServeWon + diffServSpeed + Win_perBreakWon +
    Los_percentServeWon + Los_perBreakWon + CourtType ,data = .)))
get_vif <- function(mod) vif(mod)
View(nested)
VIF<-nested%>% mutate(vif = map_df(model,get_vif))
get_rsq <- function(mod) glance(mod)$r.squared
nested <- nested %>% mutate(r.squared = map_dbl(model, get_rsq))

View(nested)
nested<- nested %>% mutate(tidy_model = map(model,broom::tidy), augment_model = map(model,broom::augment))

nested
#Visualizing side by side
table<- nested %>% unnest(tidy_model,.drop = T)
augmentTable<- nested %>% unnest(augment_model)
View(augmentTable)


#Visualizing estimates for Two different model based on Tour
table%>%
  ggplot(aes(x = term, y = estimate)) +
  geom_hline(aes(yintercept = 0), linetype = "dashed") +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error),
                width = .2) +
  geom_point(aes(color = term), size = 2) +
  coord_flip() +
  facet_wrap(~Tour,scales = "free") +
  theme_classic() +
  theme(legend.position = "none")
#Plotting fitted by residual side by side for two different model
p <- ggplot(data = augmentTable,
            mapping = aes(x = augmentTable$.fitted, y = augmentTable$.std.resid))
p + geom_point()  + geom_hline(yintercept=0, color= "Black", size= 1.2) + geom_hline(yintercept=6.02391, color= "Yellow", size= 1) +
  geom_hline(yintercept=-6.02391, color= "Yellow", size= 1) + theme_minimal() + facet_wrap(~Tour)

#Plotting fitted by residual side by side for two different model