Advertisement
Guest User

Mixing Metrics R Code

a guest
Jan 4th, 2015
275
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 8.27 KB | None | 0 0
  1. library(RMySQL) # read SQL into R
  2. library(dplyr)
  3. library(MASS)
  4. drv=dbDriver("MySQL")
  5.  
  6. # You need to create a matrix of pitcherid, season, 12 count types so that we can run tests on all these critera. Below is personalized for my database, but can be easily personalized to yours given a pitchfx db is in place.
  7.  
  8. con <- dbConnect(drv, user="root", password="root",
  9.                  dbname="pitchRx",
  10.                  unix.socket="/Applications/MAMP/tmp/mysql/mysql.sock")
  11. con2 <- dbConnect(drv, user="root", password="root",
  12.                  dbname="test",
  13.                  unix.socket="/Applications/MAMP/tmp/mysql/mysql.sock")
  14. temp<-dbGetQuery(con, "SELECT @rownum:=@rownum + 1 as rownum, a.*
  15.                   FROM
  16.                 (SELECT a.pitch_type, CASE WHEN  a.pitch_type = 'FF' or a.pitch_type ='FA' or a.pitch_type = 'FT' or a.pitch_type = 'FS' or a.pitch_type = 'FC' or a.pitch_type = 'SI' then 1 else 0 end as FF, a.gameday_link, a.num, CONCAT(case when a.on_1b !=0 then 1 else 0 end, case when a.on_2b !=0 then 1 else 0 end, case when a.on_3b !=0 then 1 else 0 end,' ', a.count) as state
  17.                  FROM pitch a
  18.                  Group by a.gameday_link, a.num, a.count) a,
  19.                 (SELECT @rownum:=0) r;")  # temporary pitch data
  20. pitcher<-dbGetQuery(con, "SELECT a.pitcher, a.gameday_link, a.num
  21.                    From atbat a;") # data on pitcher and game
  22.  
  23. base<-dbGetQuery(con2,"SELECT a.pitcher, SUBSTR(a.gameday_link,5,4) as Season, SUBSTR(a.state,5,3) as count, COUNT(*) as N
  24. FROM data1 a, sampleP b
  25. Where a.pitcher = b.pitcher and SUBSTR(a.gameday_link,5,4)  = b.season
  26. group by a.pitcher, SUBSTR(a.gameday_link,5,4), SUBSTR(a.state,5,3);")
  27.  
  28. data<-merge(pitcher, temp, by=c("gameday_link", "num")) # merge pitch information with pitcher id's
  29. data_2011_2014<-subset(data, substr(data$gameday_link,5,8) >= 2011 & substr(data$gameday_link,11,11) >= 4 & data$pitch_type != 0 ) # remove null pitch types and preseason data, isolate for seasons of interest
  30. data_2011_2014<-data_2011_2014[order(data_2011_2014$rownum),] # order the data.frame by data using row number created in "temp"
  31. count<-substr(data_2011_2014$state,5,7)
  32. season<-substr(data_2011_2014$gameday_link,5,8)
  33. data_2011_2014<-cbind(data_2011_2014,count,season)
  34.  
  35. playerid<-base$pitcher
  36. season<-c("2011", "2012", "2013", "2014")
  37. plyr1<-cbind(base,1,1,1) # creation of matrix with id, season, count, num_pitches, and three empty columns for our metrics
  38.  
  39. y=0
  40. for (g in 1:length(playerid)){# runs test loop
  41.   y=y+1
  42.   player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
  43.   FF<-player.PS$FF
  44.   if (length(FF) <= 1)){
  45.     print(plyr1[g,1])
  46.     print(plyr1[g,2])
  47.     print(plyr1[g,3])
  48.     print("NULL")
  49.     print("runs")
  50.     plyr1[y,5]="NULL"
  51.   }
  52.   else{
  53.   rv<-rep(0,length(FF))
  54.   for (i in 2:length(FF)){
  55.     a<-ifelse(FF[i]==FF[i-1],1,0)
  56.     rv[i]<-a}}
  57.   n<-length(FF)
  58.   pFF<-sum(ifelse(FF==1,1,0))/length(FF)
  59.   obsv<-sum(rv)
  60.   diff<-(abs(obsv-2*n*pFF*(1-pFF)))
  61.   sd<-2*sqrt(2*n)*pFF*(1-pFF)
  62.   stat<-diff/sd
  63.   print(plyr1[g,1])
  64.   print(plyr1[g,2])
  65.   print(plyr1[g,3])
  66.   print(stat)
  67.   print("runs")
  68.   plyr1[y,5]=stat}
  69.  
  70.  
  71.  
  72.  
  73. y=0
  74. for (g in 1:length(playerid)){ # csum test loop
  75.   y=y+1
  76.   player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
  77.   FF<-player.PS$FF
  78.   FF<-2*FF-1
  79.   nFF<-length(FF)
  80.   n=0
  81.   r2<-c(0)
  82.   Sm<-rep(0,length(FF))
  83.   for (i in 1:length(FF)){
  84.     n=n+1
  85.     r1<-c(FF[i])
  86.     r2<-c(r2,r1)
  87.     sr2<-sum(r2)
  88.     Sm[n]=sr2
  89.   }
  90.     z<-max(abs(Sm))/sqrt(nFF)
  91.     plyr1[g,6]=z
  92.     print(g)
  93.     print(plyr1[g,1])
  94.     print(plyr1[g,2])
  95.     print(plyr1[g,3])
  96.     print("CuSUM")  
  97.     print(z)}
  98.  
  99. y=0
  100. for (g in 1:length(playerid)){ # streaks test loop
  101.   y=y+1
  102.   player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
  103.   FF<-player.PS$FF
  104.   ST<-replicate(1000,random.mix(FF))
  105.   stat<- sum(streaks(1-FF)^2)
  106.   zRF<-(stat-mean(ST))/sd(ST)
  107.   plyr1[y,7]=zRF
  108.   print(plyr1[g,1])
  109.   print(plyr1[g,2])
  110.   print(plyr1[g,3])
  111.   print("Streaks")  
  112.   print(zRF)}
  113.  
  114.  
  115. y=0
  116. for (g in 1:length(playerid)){ # counter var, number of pitches
  117.   y=y+1
  118.   player.PS <- subset(data_2011_2014, pitcher == plyr1[g,1] & season == plyr1[g,2] & count == plyr1[g,3])
  119.   FF<-player.PS$FF
  120.   n<-length(FF)
  121.   plyr1[g,6]=n
  122.   print(n)
  123.   print(g)}
  124.  
  125. # ------------------------------------------------------ #
  126. #split half relaiblity test to create pitch cut-off
  127. playerid2<-`2014relsample`$pitcher # read in pitcherid's of your reliability test sample
  128. n<-seq(0,100,by=5)
  129. res<-cbind(n,1)
  130. res<-res[-1,]
  131. y=0  #split-half correlation loop
  132. for (n in res[,1]){
  133.   y=y+1
  134.   print(y)
  135.   cor11<-c(0)
  136.   cor22<-c(0)
  137. for (g in playerid2){
  138.   player.PS <- subset(data_2014, pitcher == g)
  139.   print(g)
  140.   print(n)
  141.   FF<-player.PS[1:n,]$FF
  142.   popFF<-cbind(1:length(FF),FF)
  143.   pFF<-sum(ifelse(popFF[,2]==1,1,0))/length(popFF[,2])
  144.   FF_even<-subset(popFF, popFF[,1] %% 2 ==0)
  145.   FF_even<-FF_even[,2]
  146.   FF_odd<-subset(popFF, popFF[,1] %% 2 ==1)
  147.   FF_odd<-FF_odd[,2]
  148.   cor1<-sum(FF_odd)
  149.   cor2<-sum(FF_even)
  150.   cor11<-c(cor11,cor1)
  151.   cor22<-c(cor22,cor2)
  152. }
  153. cor11<-cor11[-1]
  154. cor22<-cor22[-1]
  155. r<-cor(cor11,cor22)
  156. smb<-(2*r/1+r)
  157. print(r)
  158. res[y,2]=r}
  159.  
  160. playerid2<-`2014relsample`$pitcher
  161. n<-seq(0,100,by=5)
  162. res1<-cbind(n,1)
  163. res1<-res1[-1,]
  164. y=0  #split-half correlation loop
  165. for (n in res[,1]){
  166.   y=y+1
  167.   print(y)
  168.   diff_odd<-c(0)
  169.   diff_even<-c(0)
  170.   for (g in playerid2){
  171.     player.PS <- subset(data_2014, pitcher == g)
  172.     print(g)
  173.     print(n)
  174.     FF<-player.PS[1:n,]$FF
  175.     popFF<-cbind(1:length(FF),FF)
  176.     pFF<-sum(ifelse(popFF[,2]==1,1,0))/length(popFF[,2])
  177.     FF_even<-subset(popFF, popFF[,1] %% 2 ==0)
  178.     FF_even1<-FF_even[,2]
  179.     FF_odd<-subset(popFF, popFF[,1] %% 2 ==1)
  180.     FF_odd1<-FF_odd[,2]
  181.     FF_even<- cbind(1:length(FF_even),1)
  182.     FF_odd<- cbind(1:length(FF_even),1)
  183.     for (i in 1:length(FF_even)){
  184.       FF1_even<-ifelse(FF_even1[i]+FF_odd1[i-1]==2, 1,0)
  185.       sum(FF1_even)
  186.       FF2_even<-sum(FF1)
  187.       FF_even[y,2]=sum(FF1)
  188.     FF2inrow<-sum(FF_2[,2])/(length(FF)-2)
  189.     pFF<-sum(FF_even1==1)/(length(FF_even1)-2)
  190.     diff1=pFF^2-FF2inrow
  191.     print(diff1)
  192.     print(c)
  193.     counts[y,3]=diff1}}
  194.   }
  195.   cor11<-cor11[-1]
  196.   cor22<-cor22[-1]
  197.   r<-cor(cor11,cor22)
  198.   smb<-(2*r/1+r)
  199.   print(r)
  200.   res[y,2]=r}
  201.  
  202.  
  203. write.csv(plyr1, "~/Desktop/statsY2Y.csv")
  204. #---------------
  205. #Extra stuff for you to play with
  206.  
  207. numpitches<-dbGetQuery(con1, "SELECT a.pitcher, SUBSTR(a.gameday_link,5,4) as season , count(distinct a.pitch_type)
  208. FROM data1 a
  209. group by  a.pitcher, SUBSTR(a.gameday_link,5,4);")
  210. sampledataY2Y<-dbGetQuery(con1, "SELECT a.pitcher, a.season, a.N, a.csum, a.std_csum, a.wFB, a.FIP, a.`wFB/C` as wFBc, a.FBp, a.FBv, a.wP, b.season as season2, b.N as N1, b.csum as csum1, b.std_csum as std_csum1, b.wFB as wFB1, b.FIP as FIP1, b.`wFB/C` as wFBc1, b.FBp as FBp1, b.FBv as FBv1, b.wP as wP1
  211. FROM
  212. (SELECT a.*
  213. From sampledata a
  214. where a.season >=2011) a
  215. INNER JOIN
  216. (SELECT a.*
  217. From sampledata a
  218. where a.season >= 2012) b
  219. on a.pitcher = b.pitcher and a.season = b.season -1
  220. group by a.pitcher, a.season")
  221. numfastballs<-dbGetQuery(con1, "SELECT a.pitcher, SUBSTR(a.gameday_link,5,4) asseason , count(distinct a.vFF)
  222. FROM vFF a
  223. group by SUBSTR(a.gameday_link,5,4), a.pitcher;
  224. ;")
  225.  
  226. streaks<-function(y){ # creation of streaks function from Jim Albert and Max Marchi
  227.   n <- length(y)
  228.   where <- c(0,y,0) == 0
  229.   location.zeroes <- (0 : (n+1))[where]
  230.   streak.lengths<- diff(location.zeroes) - 1
  231.   streak.lengths[streak.lengths > 0]
  232. }
  233.  
  234.  
  235. random.mix<- function(y){ # creation of random.mix function from Jim Albert and Max Marchi
  236.   clump.stat <- function(sp) sum(sp^2)
  237.   mixed <- sample(y)
  238.   clump.stat(streaks(1-mixed))
  239. }
  240.  
  241. clump.test<-function(playerid, data){ # creation of clump.test function from Jim Albert and Max Marchi
  242.   player.PS <- subset(data, pitcher == 519455)
  243.   FF<-player.PS$FF
  244.   ST<-replicate(1000,random.mix(FF))
  245.   truehist(ST)
  246.   stat<- sum(streaks(1-FF)^2)
  247.   RF<-(sum(streaks(1-FF)^2)-mean(replicate(1000,random.mix(FF))))
  248.   abline(v=stat, lwd=3)
  249.   text(stat*1.05, 0.0010, "OBSERVED")
  250. }
  251.  
  252. get_FF<-function(p){ #function to get FF vector
  253.   player.PS <- subset(data_2014, pitcher == p )
  254.   FF<-player.PS$FF
  255.   print(FF)}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement