Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(RMySQL) # read SQL into R
- library(dplyr)
- library(MASS)
- drv=dbDriver("MySQL")
- # You need to create a matrix of pitcherid, season, 12 count types so that we can run tests on all these critera. Below is personalized for my database, but can be easily personalized to yours given a pitchfx db is in place.
- con <- dbConnect(drv, user="root", password="root",
- dbname="pitchRx",
- unix.socket="/Applications/MAMP/tmp/mysql/mysql.sock")
- con2 <- dbConnect(drv, user="root", password="root",
- dbname="test",
- unix.socket="/Applications/MAMP/tmp/mysql/mysql.sock")
- temp<-dbGetQuery(con, "SELECT @rownum:=@rownum + 1 as rownum, a.*
- FROM
- (SELECT a.pitch_type, CASE WHEN a.pitch_type = 'FF' or a.pitch_type ='FA' or a.pitch_type = 'FT' or a.pitch_type = 'FS' or a.pitch_type = 'FC' or a.pitch_type = 'SI' then 1 else 0 end as FF, a.gameday_link, a.num, CONCAT(case when a.on_1b !=0 then 1 else 0 end, case when a.on_2b !=0 then 1 else 0 end, case when a.on_3b !=0 then 1 else 0 end,' ', a.count) as state
- FROM pitch a
- Group by a.gameday_link, a.num, a.count) a,
- (SELECT @rownum:=0) r;") # temporary pitch data
- pitcher<-dbGetQuery(con, "SELECT a.pitcher, a.gameday_link, a.num
- From atbat a;") # data on pitcher and game
- base<-dbGetQuery(con2,"SELECT a.pitcher, SUBSTR(a.gameday_link,5,4) as Season, SUBSTR(a.state,5,3) as count, COUNT(*) as N
- FROM data1 a, sampleP b
- Where a.pitcher = b.pitcher and SUBSTR(a.gameday_link,5,4) = b.season
- group by a.pitcher, SUBSTR(a.gameday_link,5,4), SUBSTR(a.state,5,3);")
- data<-merge(pitcher, temp, by=c("gameday_link", "num")) # merge pitch information with pitcher id's
- data_2011_2014<-subset(data, substr(data$gameday_link,5,8) >= 2011 & substr(data$gameday_link,11,11) >= 4 & data$pitch_type != 0 ) # remove null pitch types and preseason data, isolate for seasons of interest
- data_2011_2014<-data_2011_2014[order(data_2011_2014$rownum),] # order the data.frame by data using row number created in "temp"
- count<-substr(data_2011_2014$state,5,7)
- season<-substr(data_2011_2014$gameday_link,5,8)
- data_2011_2014<-cbind(data_2011_2014,count,season)
- playerid<-base$pitcher
- season<-c("2011", "2012", "2013", "2014")
- plyr1<-cbind(base,1,1,1) # creation of matrix with id, season, count, num_pitches, and three empty columns for our metrics
- y=0
- for (g in 1:length(playerid)){# runs test loop
- y=y+1
- player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
- FF<-player.PS$FF
- if (length(FF) <= 1)){
- print(plyr1[g,1])
- print(plyr1[g,2])
- print(plyr1[g,3])
- print("NULL")
- print("runs")
- plyr1[y,5]="NULL"
- }
- else{
- rv<-rep(0,length(FF))
- for (i in 2:length(FF)){
- a<-ifelse(FF[i]==FF[i-1],1,0)
- rv[i]<-a}}
- n<-length(FF)
- pFF<-sum(ifelse(FF==1,1,0))/length(FF)
- obsv<-sum(rv)
- diff<-(abs(obsv-2*n*pFF*(1-pFF)))
- sd<-2*sqrt(2*n)*pFF*(1-pFF)
- stat<-diff/sd
- print(plyr1[g,1])
- print(plyr1[g,2])
- print(plyr1[g,3])
- print(stat)
- print("runs")
- plyr1[y,5]=stat}
- y=0
- for (g in 1:length(playerid)){ # csum test loop
- y=y+1
- player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
- FF<-player.PS$FF
- FF<-2*FF-1
- nFF<-length(FF)
- n=0
- r2<-c(0)
- Sm<-rep(0,length(FF))
- for (i in 1:length(FF)){
- n=n+1
- r1<-c(FF[i])
- r2<-c(r2,r1)
- sr2<-sum(r2)
- Sm[n]=sr2
- }
- z<-max(abs(Sm))/sqrt(nFF)
- plyr1[g,6]=z
- print(g)
- print(plyr1[g,1])
- print(plyr1[g,2])
- print(plyr1[g,3])
- print("CuSUM")
- print(z)}
- y=0
- for (g in 1:length(playerid)){ # streaks test loop
- y=y+1
- player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
- FF<-player.PS$FF
- ST<-replicate(1000,random.mix(FF))
- stat<- sum(streaks(1-FF)^2)
- zRF<-(stat-mean(ST))/sd(ST)
- plyr1[y,7]=zRF
- print(plyr1[g,1])
- print(plyr1[g,2])
- print(plyr1[g,3])
- print("Streaks")
- print(zRF)}
- y=0
- for (g in 1:length(playerid)){ # counter var, number of pitches
- y=y+1
- player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
- FF<-player.PS$FF
- n<-length(FF)
- plyr1[g,6]=n
- print(n)
- print(g)}
- # ------------------------------------------------------ #
- #split half relaiblity test to create pitch cut-off
- playerid2<-`2014relsample`$pitcher # read in pitcherid's of your reliability test sample
- n<-seq(0,100,by=5)
- res<-cbind(n,1)
- res<-res[-1,]
- y=0 #split-half correlation loop
- for (n in res[,1]){
- y=y+1
- print(y)
- cor11<-c(0)
- cor22<-c(0)
- for (g in playerid2){
- player.PS <- subset(data_2014, pitcher == g)
- print(g)
- print(n)
- FF<-player.PS[1:n,]$FF
- popFF<-cbind(1:length(FF),FF)
- pFF<-sum(ifelse(popFF[,2]==1,1,0))/length(popFF[,2])
- FF_even<-subset(popFF, popFF[,1] %% 2 ==0)
- FF_even<-FF_even[,2]
- FF_odd<-subset(popFF, popFF[,1] %% 2 ==1)
- FF_odd<-FF_odd[,2]
- cor1<-sum(FF_odd)
- cor2<-sum(FF_even)
- cor11<-c(cor11,cor1)
- cor22<-c(cor22,cor2)
- }
- cor11<-cor11[-1]
- cor22<-cor22[-1]
- r<-cor(cor11,cor22)
- smb<-(2*r/1+r)
- print(r)
- res[y,2]=r}
- playerid2<-`2014relsample`$pitcher
- n<-seq(0,100,by=5)
- res1<-cbind(n,1)
- res1<-res1[-1,]
- y=0 #split-half correlation loop
- for (n in res[,1]){
- y=y+1
- print(y)
- diff_odd<-c(0)
- diff_even<-c(0)
- for (g in playerid2){
- player.PS <- subset(data_2014, pitcher == g)
- print(g)
- print(n)
- FF<-player.PS[1:n,]$FF
- popFF<-cbind(1:length(FF),FF)
- pFF<-sum(ifelse(popFF[,2]==1,1,0))/length(popFF[,2])
- FF_even<-subset(popFF, popFF[,1] %% 2 ==0)
- FF_even1<-FF_even[,2]
- FF_odd<-subset(popFF, popFF[,1] %% 2 ==1)
- FF_odd1<-FF_odd[,2]
- FF_even<- cbind(1:length(FF_even),1)
- FF_odd<- cbind(1:length(FF_even),1)
- for (i in 1:length(FF_even)){
- FF1_even<-ifelse(FF_even1[i]+FF_odd1[i-1]==2, 1,0)
- sum(FF1_even)
- FF2_even<-sum(FF1)
- FF_even[y,2]=sum(FF1)
- FF2inrow<-sum(FF_2[,2])/(length(FF)-2)
- pFF<-sum(FF_even1==1)/(length(FF_even1)-2)
- diff1=pFF^2-FF2inrow
- print(diff1)
- print(c)
- counts[y,3]=diff1}}
- }
- cor11<-cor11[-1]
- cor22<-cor22[-1]
- r<-cor(cor11,cor22)
- smb<-(2*r/1+r)
- print(r)
- res[y,2]=r}
- write.csv(plyr1, "~/Desktop/statsY2Y.csv")
- #---------------
- #Extra stuff for you to play with
- numpitches<-dbGetQuery(con1, "SELECT a.pitcher, SUBSTR(a.gameday_link,5,4) as season , count(distinct a.pitch_type)
- FROM data1 a
- group by a.pitcher, SUBSTR(a.gameday_link,5,4);")
- sampledataY2Y<-dbGetQuery(con1, "SELECT a.pitcher, a.season, a.N, a.csum, a.std_csum, a.wFB, a.FIP, a.`wFB/C` as wFBc, a.FBp, a.FBv, a.wP, b.season as season2, b.N as N1, b.csum as csum1, b.std_csum as std_csum1, b.wFB as wFB1, b.FIP as FIP1, b.`wFB/C` as wFBc1, b.FBp as FBp1, b.FBv as FBv1, b.wP as wP1
- FROM
- (SELECT a.*
- From sampledata a
- where a.season >=2011) a
- INNER JOIN
- (SELECT a.*
- From sampledata a
- where a.season >= 2012) b
- on a.pitcher = b.pitcher and a.season = b.season -1
- group by a.pitcher, a.season")
- numfastballs<-dbGetQuery(con1, "SELECT a.pitcher, SUBSTR(a.gameday_link,5,4) asseason , count(distinct a.vFF)
- FROM vFF a
- group by SUBSTR(a.gameday_link,5,4), a.pitcher;
- ;")
- streaks<-function(y){ # creation of streaks function from Jim Albert and Max Marchi
- n <- length(y)
- where <- c(0,y,0) == 0
- location.zeroes <- (0 : (n+1))[where]
- streak.lengths<- diff(location.zeroes) - 1
- streak.lengths[streak.lengths > 0]
- }
- random.mix<- function(y){ # creation of random.mix function from Jim Albert and Max Marchi
- clump.stat <- function(sp) sum(sp^2)
- mixed <- sample(y)
- clump.stat(streaks(1-mixed))
- }
- clump.test<-function(playerid, data){ # creation of clump.test function from Jim Albert and Max Marchi
- player.PS <- subset(data, pitcher == 519455)
- FF<-player.PS$FF
- ST<-replicate(1000,random.mix(FF))
- truehist(ST)
- stat<- sum(streaks(1-FF)^2)
- RF<-(sum(streaks(1-FF)^2)-mean(replicate(1000,random.mix(FF))))
- abline(v=stat, lwd=3)
- text(stat*1.05, 0.0010, "OBSERVED")
- }
- get_FF<-function(p){ #function to get FF vector
- player.PS <- subset(data_2014, pitcher == p )
- FF<-player.PS$FF
- print(FF)}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement