Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(RMySQL)
- library(DBI)
- library(dplyr)
- library(class)
- library(datasets)
- library(ggplot2)
- library(stats)
- library(utils)
- library(lme4)
- library(corrplot)
- library(ggplot2)
- library(stringi)
- library(scales)
- library(stats)
- # Connecting to database
- mydb = dbConnect(MySQL(), user='root', password='Pass1234*', dbname='sportsanalytics', host='localhost')
- # Visualize tables in the database
- dbListTables(mydb)
- rav <- function(userId, dat, days) {
- trainingsession_ath %>% subset(date >= (dat-days) & date <= dat &
- aspnetusers_id == userId) %>% select(TL) %>%
- summarize(sum(TL))
- }
- ## Get Dailywellness data
- dlywns = dbSendQuery(mydb, "select aspnetusers_id,
- date,sleepduration,
- sleepQuality, Nutrition, Stress,
- Fatigue, musclesoreness
- from dailywellness")
- dailywellness = fetch(dlywns, n=-1)
- dailywellness$date <- as.Date(dailywellness$date, format = "%Y-%m-%d")
- ## get player performance
- pp = dbSendQuery(mydb, "select *
- from playerperformance")
- playerperf = fetch(pp, n=-1)
- playerperf$Date <- as.Date(playerperf$Date, format = "%Y-%m-%d")
- ## Get Trainingsessionbyathlete
- # TL is variable we create. TL or Training Load = (RPE * Duration)
- tsa = dbSendQuery(mydb, "select aspnetusers_id, date, (RPE * Duration) as TL, RPE, Duration as Athlete_Duration
- from trainingsessionbyathlete
- group by aspnetusers_id, date
- order by date desc")
- trainingsession_ath = fetch(tsa, n=-1)
- # Code below adds a date variable
- trainingsession_ath$date <- as.Date(trainingsession_ath$date, format = "%Y-%m-%d")
- # The below code is creating a comparison between Training Load (TL) for the last 7 days against the last 28 days.
- # This is important to see if an athlete is taking an increased load which in turn could lead to injury.
- trainingsession_ath$last7DayTL<- unlist(apply(trainingsession_ath[,c("aspnetusers_id","date")],
- 1,
- function(y) rav(y["aspnetusers_id"],
- as.Date(y["date"], format="%Y-%m-%d"),
- 7)))
- trainingsession_ath$last28DayTL<- unlist(apply(trainingsession_ath[,c("aspnetusers_id","date")],
- 1,
- function(y) rav(y["aspnetusers_id"],
- as.Date(y["date"], format="%Y-%m-%d"),
- 28)))
- # acl variable will be addressing recent workload on the athlete
- # a high number indicates that workload in the last week is higher than the athlete's normal workload
- trainingsession_ath$acl <- trainingsession_ath$last7DayTL/ trainingsession_ath$last28DayTL
- ## Get TrainingSession by Trainer
- trn_inj <- dbSendQuery(mydb, "select * from trainingsessionbytrainer
- order by date desc;")
- trainingsession_trn = fetch(trn_inj, n=-1)
- trainingsession_trn$Date <- as.Date(trainingsession_trn$Date, format = "%Y-%m-%d")
- ## Get active roster
- roster <- dbSendQuery(mydb, "select firstname,lastname,userId from active_roster;")
- active_roster = fetch(roster, n=-1)
- ###
- # Below we are merging data frames! p1 we are merging dailywellness with trainingsession by trainer.
- # df2 is merging the existing p1 with trainingsession by athlete
- # df3 is merging the existing p2 with active_roster
- df1 <- merge.data.frame( dailywellness, trainingsession_trn, by.x=c("aspnetusers_id","date") ,
- by.y = c("aspnetusers_Id","Date"))
- df2 <- merge.data.frame(df1, trainingsession_ath,by.x=c("aspnetusers_id","date"),
- by.y=c("aspnetusers_id","date"))
- df3 <- merge.data.frame(df2, active_roster, by.x = "aspnetusers_id",
- by.y = "userId")
- # Showing first and last name
- df3$name <- paste(df3$firstname,df3$lastname, sep = ",")
- str(df3)
- # Turning names into factors
- df3$name <- factor(df3$name)
- df3$Injury <- stri_trans_totitle(df3$Injury) #^^^^^^^^^^^^^^^^^^^^^
- df3$Injury<- ifelse(df3$Injury =="Null", "", df3$Injury)
- df3$Injury<- ifelse(df3$Injury =="","No Injury", df3$Injury)
- df3$Injury<- ifelse(df3$Injury =="New","New Injury", df3$Injury)
- df3$Injury<- ifelse(df3$Injury =="Existing","Existing Injury", df3$Injury)
- df3$Injury<- ifelse(df3$Injury =="None","No Injury", df3$Injury)
- df3$Injury <- ifelse(df3$Injury=="No Injury", "No_Injury", df3$Injury)
- df3$Injury <- ifelse(df3$Injury=="New Injury", "New_Injury", df3$Injury)
- df3$Injury <- ifelse(df3$Injury=="Existing Injury", "Existing_Injury", df3$Injury)
- table(df3$Injury)
- # Create InjuryFactor variable that is binary - new injury = 1, everything else = 0
- df3$InjuryFactor <- ifelse(df3$Injury=="New_Injury", 1 , 0)
- df3$Injury_Type <- stri_trans_totitle(df3$Injury_Type)
- df3$Injury_Type <- ifelse(df3$Injury_Type=="Null","", df3$Injury_Type)
- table(df3$Injury)
- # CLEAN UP df3
- # Get rid of trainingsessionbytrainerid, firstname, lastname
- df3 <- df3[, !(names(df3) %in% c("trainingsessionbytrainerid", "firstname", "lastname"))] #^^^^^^^^^^^^
- # Get rid of sleep greater than or equal to 12
- df3 <- subset(df3, df3$sleepduration <=12)
- # Converting df3 dataframe to new dataframe with less unnecessary variables
- ath.data <- df3[, c("date", "sleepduration", "sleepQuality","Nutrition","Stress","Fatigue","musclesoreness","TL"
- ,"Duration", "RPE", "Injury", "Injury_Type", "acl")]
- # Converting df3 dataframe to new dataframe
- ath.data.use <- df3[ !(names(df3) %in% c("aspnetusers_id","Vertical_Jump","USG","Body_Weight","date","name"))]
- ath.data.unscale <- ath.data.use
- # Centering and scaling variables ; Duration, TL, Athlete_Duration for better analysis
- ath.data.use$Duration <- scale(ath.data.use$Duration,
- center=median(ath.data.use$Duration),
- scale=mad(ath.data.use$Duration) )
- ath.data.use$TL <- scale(ath.data.use$TL,
- center=median(ath.data.use$TL),
- scale=mad(ath.data.use$TL) )
- ath.data.use$last7DayTL <- scale(ath.data.use$last7DayTL,
- center=median(ath.data.use$last7DayTL),
- scale=mad(ath.data.use$last7DayTL) )
- ath.data.use$last28DayTL <- scale(ath.data.use$last28DayTL,
- center=median(ath.data.use$last28DayTL),
- scale=mad(ath.data.use$last28DayTL) )
- ath.data.use$Athlete_Duration <- scale(ath.data.use$Athlete_Duration,
- center=median(ath.data.use$Athlete_Duration),
- scale=mad(ath.data.use$Athlete_Duration) )
- # Added back in date and name variables
- ath.data.use <- cbind(ath.data.use, df3$date)
- ath.data.use <- cbind(ath.data.use, df3$name)
- ath.data.use$date <- ath.data.use$`df3$date`
- ath.data.use$name <- ath.data.use$`df3$name`
- # Delete unused name and date variables
- ath.data.use$`df3$date` <- NULL
- ath.data.use$`df3$name` <- NULL
- # Same thing as above
- ath.data.unscale <- cbind(ath.data.unscale, df3$date)
- ath.data.unscale <- cbind(ath.data.unscale, df3$name)
- ath.data.unscale$date <- ath.data.unscale$`df3$date`
- ath.data.unscale$name <- ath.data.unscale$`df3$name`
- ath.data.unscale$`df3$date` <- NULL
- ath.data.unscale$`df3$name` <- NULL
- write.csv(ath.data.use, file = "c:\\Capstone Project Share\\R Scripts - MySQL Connection\\ 7 - Cleaning Data for Power BI Connection_John.csv")
Add Comment
Please, Sign In to add comment