Untitled

library(RMySQL)
library(DBI)
library(dplyr)
library(class)
library(datasets)
library(ggplot2)
library(stats)
library(utils)
library(lme4)
library(corrplot)
library(ggplot2)
library(stringi)
library(scales)
library(stats)


# Connecting to database

mydb = dbConnect(MySQL(), user='root', password='Pass1234*', dbname='sportsanalytics', host='localhost')

# Visualize tables in the database

dbListTables(mydb)


rav <- function(userId, dat, days) {


  trainingsession_ath %>% subset(date >= (dat-days) &  date <= dat &
                                   aspnetusers_id == userId) %>% select(TL) %>%
    summarize(sum(TL))

}

## Get Dailywellness data

dlywns = dbSendQuery(mydb, "select aspnetusers_id,
                     date,sleepduration,
                     sleepQuality, Nutrition, Stress,
                     Fatigue, musclesoreness
                     from dailywellness")

dailywellness = fetch(dlywns, n=-1)
dailywellness$date <- as.Date(dailywellness$date, format = "%Y-%m-%d")

## get player performance

pp = dbSendQuery(mydb, "select *
                 from playerperformance")

playerperf = fetch(pp, n=-1)
playerperf$Date <- as.Date(playerperf$Date, format = "%Y-%m-%d")


## Get Trainingsessionbyathlete

# TL is variable we create. TL or Training Load = (RPE * Duration)

tsa = dbSendQuery(mydb, "select aspnetusers_id, date, (RPE * Duration) as TL, RPE, Duration as Athlete_Duration
                  from trainingsessionbyathlete
                  group by aspnetusers_id, date
                  order by date desc")

trainingsession_ath = fetch(tsa, n=-1)

# Code below adds a date variable

trainingsession_ath$date <- as.Date(trainingsession_ath$date, format = "%Y-%m-%d")


# The below code is creating a comparison between Training Load (TL) for the last 7 days against the last 28 days.
# This is important to see if an athlete is taking an increased load which in turn could lead to injury.


trainingsession_ath$last7DayTL<- unlist(apply(trainingsession_ath[,c("aspnetusers_id","date")],
                                    1,
                                    function(y) rav(y["aspnetusers_id"],
                                    as.Date(y["date"], format="%Y-%m-%d"),
                                    7)))

trainingsession_ath$last28DayTL<- unlist(apply(trainingsession_ath[,c("aspnetusers_id","date")],
                                    1,
                                    function(y) rav(y["aspnetusers_id"],
                                    as.Date(y["date"], format="%Y-%m-%d"),
                                    28)))


# acl variable will be addressing recent workload on the athlete
# a high number indicates that workload in the last week is higher than the athlete's normal workload

trainingsession_ath$acl <- trainingsession_ath$last7DayTL/ trainingsession_ath$last28DayTL


## Get TrainingSession by Trainer

trn_inj <- dbSendQuery(mydb, "select * from trainingsessionbytrainer
                       order by date desc;")

trainingsession_trn = fetch(trn_inj, n=-1)
trainingsession_trn$Date <- as.Date(trainingsession_trn$Date, format = "%Y-%m-%d")


## Get active roster

roster <- dbSendQuery(mydb, "select firstname,lastname,userId from active_roster;")

active_roster = fetch(roster, n=-1)


###

# Below we are merging data frames! p1 we are merging dailywellness with trainingsession by trainer.
# df2 is merging the existing p1 with trainingsession by athlete
# df3 is merging the existing p2 with active_roster

df1 <- merge.data.frame( dailywellness, trainingsession_trn, by.x=c("aspnetusers_id","date") ,
                       by.y =  c("aspnetusers_Id","Date"))

df2 <- merge.data.frame(df1, trainingsession_ath,by.x=c("aspnetusers_id","date"),
                       by.y=c("aspnetusers_id","date"))

df3 <- merge.data.frame(df2, active_roster, by.x = "aspnetusers_id",
                       by.y = "userId")

# Showing first and last name

df3$name <- paste(df3$firstname,df3$lastname, sep = ",")

str(df3)


# Turning names into factors

df3$name <- factor(df3$name)

df3$Injury <- stri_trans_totitle(df3$Injury)                    #^^^^^^^^^^^^^^^^^^^^^
df3$Injury<- ifelse(df3$Injury =="Null", "", df3$Injury)
df3$Injury<- ifelse(df3$Injury =="","No Injury", df3$Injury)
df3$Injury<- ifelse(df3$Injury =="New","New Injury", df3$Injury)
df3$Injury<- ifelse(df3$Injury =="Existing","Existing Injury", df3$Injury)
df3$Injury<- ifelse(df3$Injury =="None","No Injury", df3$Injury)
df3$Injury <- ifelse(df3$Injury=="No Injury", "No_Injury", df3$Injury)
df3$Injury <- ifelse(df3$Injury=="New Injury", "New_Injury", df3$Injury)
df3$Injury <- ifelse(df3$Injury=="Existing Injury", "Existing_Injury", df3$Injury)


table(df3$Injury)


# Create InjuryFactor variable that is binary - new injury = 1, everything else = 0

df3$InjuryFactor <- ifelse(df3$Injury=="New_Injury", 1 , 0)

df3$Injury_Type <- stri_trans_totitle(df3$Injury_Type)
df3$Injury_Type <- ifelse(df3$Injury_Type=="Null","", df3$Injury_Type)


table(df3$Injury)

# CLEAN UP df3

# Get rid of trainingsessionbytrainerid, firstname, lastname

df3 <- df3[, !(names(df3) %in% c("trainingsessionbytrainerid", "firstname", "lastname"))]    #^^^^^^^^^^^^

# Get rid of sleep greater than or equal to 12

df3 <- subset(df3, df3$sleepduration <=12)

# Converting df3 dataframe to new dataframe with less unnecessary variables

ath.data <- df3[, c("date", "sleepduration", "sleepQuality","Nutrition","Stress","Fatigue","musclesoreness","TL"
                   ,"Duration", "RPE", "Injury", "Injury_Type", "acl")]

# Converting df3 dataframe to new dataframe

ath.data.use <-  df3[ !(names(df3) %in% c("aspnetusers_id","Vertical_Jump","USG","Body_Weight","date","name"))]

ath.data.unscale <- ath.data.use


# Centering and scaling variables ; Duration, TL, Athlete_Duration for better analysis

ath.data.use$Duration <- scale(ath.data.use$Duration,
                               center=median(ath.data.use$Duration),
                               scale=mad(ath.data.use$Duration) )

ath.data.use$TL <- scale(ath.data.use$TL,
                         center=median(ath.data.use$TL),
                         scale=mad(ath.data.use$TL)  )

ath.data.use$last7DayTL <- scale(ath.data.use$last7DayTL,
                                 center=median(ath.data.use$last7DayTL),
                                 scale=mad(ath.data.use$last7DayTL)  )

ath.data.use$last28DayTL <- scale(ath.data.use$last28DayTL,
                                  center=median(ath.data.use$last28DayTL),
                                  scale=mad(ath.data.use$last28DayTL)  )

ath.data.use$Athlete_Duration <- scale(ath.data.use$Athlete_Duration,
                                       center=median(ath.data.use$Athlete_Duration),
                                       scale=mad(ath.data.use$Athlete_Duration)  )


# Added back in date and name variables

ath.data.use <- cbind(ath.data.use, df3$date)
ath.data.use <- cbind(ath.data.use, df3$name)
ath.data.use$date <- ath.data.use$`df3$date`
ath.data.use$name <- ath.data.use$`df3$name`

# Delete unused name and date variables

ath.data.use$`df3$date`   <- NULL
ath.data.use$`df3$name`  <- NULL


# Same thing as above

ath.data.unscale <- cbind(ath.data.unscale, df3$date)
ath.data.unscale <- cbind(ath.data.unscale, df3$name)
ath.data.unscale$date <- ath.data.unscale$`df3$date`
ath.data.unscale$name <- ath.data.unscale$`df3$name`
ath.data.unscale$`df3$date`   <- NULL
ath.data.unscale$`df3$name`  <- NULL


write.csv(ath.data.use, file = "c:\\Capstone Project Share\\R Scripts - MySQL Connection\\ 7 - Cleaning Data for Power BI Connection_John.csv")