Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################################
- # Release_Point_Combo_Home_Away.r #
- ###################################
- # This program estimates the release point of a pitcher using PITCHf/x data.
- # Written by Matthew Mata, May 2015 ([email protected])
- #############
- # LIBRARIES #
- #############
- # This library calls MySQL
- library('RODBC');
- library('calibrate');
- library('diagram');
- # Set the working directory and include the spatial clustering algorithm
- setwd("R Scripts/Release Point Scripts");
- source("inc_k_means_fast.r");
- #############
- # FUNCTIONS #
- #############
- # This function sets the x- and z-coordinates to a distance y
- release_dist <- function(Pitch,y){
- # Define the arrays
- x0 <- Pitch$x0;
- y0 <- Pitch$y0;
- z0 <- Pitch$z0;
- vx0 <- Pitch$vx0;
- vy0 <- Pitch$vy0;
- vz0 <- Pitch$vz0;
- ax <- Pitch$ax;
- ay <- Pitch$ay;
- az <- Pitch$az;
- # Loop over the pitches and move the data for each pitch to distance y
- a <- 0.5*ay;
- b <- vy0;
- c <- y0 - y;
- t_rel <- (-b - sqrt(b^2 - 4*a*c))/(2*a);
- x <- 0.5*ax*t_rel^2 + vx0*t_rel + x0;
- z <- 0.5*az*t_rel^2 + vz0*t_rel + z0;
- # Store the new data as a list and return
- RP <- list(x = x, z = z);
- return(RP);
- }
- # This function calculates the location where the cluster is smallest
- min_cluster <- function(Pitch,Ind,y_feet,FIRST,LAST,YEAR,SIM,H_or_A){
- x0 <- Pitch$x0[Ind];
- y0 <- Pitch$y0[Ind];
- z0 <- Pitch$z0[Ind];
- vx0 <- Pitch$vx0[Ind];
- vy0 <- Pitch$vy0[Ind];
- vz0 <- Pitch$vz0[Ind];
- ax <- Pitch$ax[Ind];
- ay <- Pitch$ay[Ind];
- az <- Pitch$az[Ind];
- N_p <- length(x0);
- # Set the number of nodes (45 feet to 65 feet in inches)
- N_y <- 241;
- # Initialize the arrays for storing the data at different distances, sample means and covariances, and areas
- R_x <- array(0,dim=c(N_y,N_p));
- R_y <- array(0,dim=c(N_y,N_p));
- R_z <- array(0,dim=c(N_y,N_p));
- mu <- array(0,dim=c(N_y,2));
- sig <- array(0,dim=c(N_y,2,2));
- mu_temp <- array(0,dim=2);
- sig_temp <- array(0,dim=c(2,2));
- ellipse_area <- array(0,dim=N_y);
- # Set the interval of where the release point will occur
- D_y <- seq(45,65,length=N_y);
- a <- 0.5*ay;
- b <- vy0;
- # Loop over the distances
- for (i in 1:N_y){
- # Set the coefficients for use in the Pythagorean theorem to solve a t^2 + b t + c = (Distance in y) for t
- c <- y0 - D_y[i];
- t <- (-b - sqrt(b^2 - 4*a*c))/(2*a);
- # Evaluate the distance at t
- R_x[i,] <- t(0.5*ax*t^2 + vx0*t + x0);
- R_y[i,] <- t(0.5*ay*t^2 + vy0*t + y0);
- R_z[i,] <- t(0.5*az*t^2 + vz0*t + z0);
- # Compute the sample mean and sample covariance
- mu_temp <- array(0,dim=2);
- sig_temp <- array(0,dim=c(2,2));
- mu_temp[1] <- sum(R_x[i,])/N_p;
- mu_temp[2] <- sum(R_z[i,])/N_p;
- sig_temp[1,1] <- sum((R_x[i,] - mu_temp[1])^2);
- sig_temp[1,2] <- sum((R_x[i,] - mu_temp[1])*(R_z[i,] - mu_temp[2]));
- sig_temp[2,1] <- sig_temp[1,2];
- sig_temp[2,2] <- sum((R_z[i,] - mu_temp[2])^2);
- sig_temp <- sig_temp/(N_p-1);
- mu[i,] <- mu_temp;
- sig[i,,] <- sig_temp;
- # Compute the eigenvalues and eigenvectors of the covariance matrix
- ellipse <- eigen(sig[i,,]);
- evalues <- ellipse$values;
- evectors <- ellipse$vectors;
- # Determine two standard deviations along the axes
- ellipse_axes <- 2*sqrt(evalues);
- # Compute the area of the ellipse
- ellipse_area[i] <- pi*ellipse_axes[1]*ellipse_axes[2];
- }
- # Find the index corresponding to the smallest area
- RP_index <- which(ellipse_area == min(ellipse_area));
- # Set up the data for ellipse plotting
- s <- seq(0,2*pi,length=200);
- x_par <- cos(s);
- y_par <- sin(s);
- # Find the minimum ellipse
- ellipse <- eigen(sig[RP_index,,]);
- # Determine the values for plotting the ellipse
- evalues <- ellipse$values;
- evectors <- ellipse$vectors;
- ellipse_axes <- 2*sqrt(evalues);
- x_ellipse <- ellipse_axes[1]*x_par;
- y_ellipse <- ellipse_axes[2]*y_par;
- ellipse_matrix <- rbind(x_ellipse,y_ellipse);
- ellipse_coord <- evectors%*%ellipse_matrix + mu[RP_index,];
- # Set bounds for the plot
- min_x <- min(R_x[RP_index,])-0.15;
- max_x <- max(R_x[RP_index,])+0.15;
- min_y <- min(R_z[RP_index,])-0.15;
- max_y <- max(R_z[RP_index,])+0.15;
- # Compute the release angle
- angle <- release_angle(ellipse);
- # Set the index for where the clustering occurs
- cluster_ind <- round(12*(y_feet-45) + 1,digits=0);
- # Determine the release point
- R_Point <- round(D_y[RP_index],digits=3);
- R_Point_h <- round(mu[RP_index,1],digits=3);
- R_Point_v <- round(mu[RP_index,2],digits=3);
- windows();
- # Plot the functional for the area of the ellipses
- plot(D_y,ellipse_area,type='n',xlab='Distance From Back of Home Plate (Feet)',ylab='Two Standard Deviation Ellipse Area (Square Feet)',
- main = paste("Release Point Functional for ",FIRST," ",LAST," 20",YEAR," ",H_or_A,SIM,"",sep=""),xaxs='i',yaxs='i');
- par(new=TRUE);
- lines(D_y,ellipse_area);
- par(new=FALSE);
- windows();
- # Plot the clustered data at the distance it was clustered
- plot(R_x[cluster_ind,],R_z[cluster_ind,],col="blue",pch=20,xlim=c(min_x,max_x),ylim=c(min_y,max_y),xlab='Horizontal Location (Feet)',
- ylab='Vertical Location (Feet)', main = paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", Pitches at ",y_feet," Ft",SIM,"",sep=""),xaxs='i',yaxs='i');
- windows();
- # Plot the cluster at the estimated release point
- plot(R_x[RP_index,],R_z[RP_index,],col="blue",pch=20,xlim=c(min_x,max_x),ylim=c(min_y,max_y),xlab='Horizontal Location (Feet)',
- ylab='Vertical Location (Feet)', main = paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", Est. Release Point, ",R_Point," Ft",SIM,"",sep=""),xaxs='i',yaxs='i');
- par(new=TRUE);
- lines(ellipse_coord[1,],ellipse_coord[2,],col='black',xlim=c(min_x,max_x),ylim=c(min_y,max_y),xlab='',ylab='',lwd=2);
- par(new=FALSE);
- # Return the release point and angle as a list
- RP_data <- list( RP_x = R_Point_h, RP_y = R_Point, RP_z = R_Point_v, RP_a = angle, RP_area = ellipse_area[RP_index] );
- return(RP_data);
- }
- # This function determines the release angle
- release_angle <- function(E){
- # Calculate the eigenvalues and eigenvectors
- evals <- E$values;
- evecs <- E$vectors;
- # Choose the smaller of the two eigenvalues and use the geometric definition of the dot product to compute the angle from horizontal
- # This means dotting the corresponding eigenvalue with (1,0), taking the absolute value, and computing the arccosine.
- if (evals[1] < evals[2]){
- angle <- acos(abs(evecs[1,1]));
- }
- else{
- angle <- acos(abs(evecs[1,2]));
- }
- # Return the angle
- angle <- round((180/pi)*angle,digits=3);
- return(angle);
- }
- # This function determines the indices of the data that has a 'pct' percent chance or better
- # of belonging to the primary (largest) cluster.
- Largest_Cluster <- function(K,pct){
- # Determine the primary cluster
- Primary <- which(K$w == max(K$w));
- # Find the points with 5% or better probability of being from the primary cluster
- Lrg <- which(K$C[Primary,] >= pct);
- return(Lrg);
- }
- ########
- # MAIN #
- ########
- # Set the player name (PARAMETER)
- FIRST <- 'Corey';
- LAST <- 'Kluber';
- YEAR <- 13;
- DB <- paste("MLB20",YEAR,sep="");
- # Set the release distance for clustering (PARAMETER)
- y_feet <- 55;
- # Number of points to average over for the rolling average (PARAMETER)
- S <- 75;
- # Set the distance in feet for a change in release location (PARAMETER)
- marker <- 0.75;
- # Set the maximal number of spatial clusters (PARAMETER)
- k_max <- 10;
- # Set HA to "1" for Home and "2" for Away data
- HA <- 1;
- if (HA == 1) {H_or_A <- '(H)'} else {H_or_A <- '(A)'};
- print(paste("Pitcher: ",FIRST," ",LAST,", 20",YEAR," " ,H_or_A,"",sep=""));
- # Set the channel (Database is MLB2014)
- channel <- odbcConnect(DB);
- # Set the query to get the PITCHf/x data and exclude any missing entries (by finding entries missing the px value)
- # Also exclude intentional balls
- Release_Query <- paste("SELECT x0, y0, z0, vx0, vy0, vz0, ax, ay, az FROM pitches WHERE
- ab_id IN (SELECT ab_id FROM atbats WHERE
- pitcher = (SELECT eliasid FROM players WHERE first = '",
- FIRST,"' AND last = '",LAST,"') AND half = ",HA,") AND
- pitch_type <> 'IN' AND px IS NOT NULL",sep="");
- # Download the pitch data
- Release <- sqlQuery(channel,Release_Query);
- # Close the channel
- close(channel);
- # Move the data to y_feet and assign the values of x and z to arrays
- RP <- release_dist(Release,y_feet);
- x_rel <- RP$x;
- z_rel <- RP$z;
- N_p <- length(x_rel);
- # Set bounds for plotting
- x_rel_min <- min(x_rel) - 0.05;
- x_rel_max <- max(x_rel) + 0.05;
- z_rel_min <- min(z_rel) - 0.05;
- z_rel_max <- max(z_rel) + 0.05;
- # Plot unclustered data
- windows();
- plot(x_rel,z_rel,col="blue",pch=20,xlim=c(x_rel_min,x_rel_max),ylim=c(z_rel_min,z_rel_max),xlab='Horizontal Location (Feet)',
- ylab='Vertical Location (Feet)', main = paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", All Pitches at 55 Ft",sep=""),xaxs='i',yaxs='i');
- # Arrays for storing the rolling averages (referred to in the code as "Snake")
- Snake_x <- array(0,dim=(N_p-S+1));
- Snake_z <- array(0,dim=(N_p-S+1));
- # Compute the rolling averages
- for (i in 1:(N_p-S+1)){
- Snake_x[i] <- mean(x_rel[c(i:(S-1+i))]);
- Snake_z[i] <- mean(z_rel[c(i:(S-1+i))]);
- }
- # Calculate the line integral or length of the snake
- line_int <- 0;
- for (i in 1:(N_p-S)){
- line_int <- line_int + sqrt((Snake_x[i+1]-Snake_x[i])^2 + (Snake_z[i+1]-Snake_z[i])^2);
- }
- # Print the length of the line
- print(paste("Length of Line: ",round(line_int,digits=3),sep=""));
- # Initialize an array for storing the distance between points in the rolling average
- sep_dist <- array(0,dim=(N_p-2*S+1));
- # Calculate the distance between disjoint, adjacent rolling averages
- for (i in 1:(N_p-2*S+1)){
- sep_dist[i] <- sqrt((Snake_x[i] - Snake_x[i+S])^2 + (Snake_z[i] - Snake_z[i+S])^2) - marker;
- }
- # Initialize the number for counting the location changes
- peak_count <- 0;
- # Count the number of peaks as the number of times the distance exceeds the 'marker' for a change
- for (i in 1:(N_p-2*S)){
- if ( sep_dist[i] < 0 && sep_dist[i+1] > 0 ){
- peak_count <- peak_count + 1;
- }
- }
- # Find the location of the maximum distance of the separations
- if ( peak_count != 0){
- loc_change <- array(0,dim=peak_count);
- k <- 1;
- max_dist <- 0;
- for (i in 1:(N_p-2*S+1)){
- if ( sep_dist[i] > max_dist ){
- loc_change[k] <- i + floor(S/2);
- max_dist <- sep_dist[i];
- }
- if ( sep_dist[i] > 0 && sep_dist[i+1] < 0 ){
- k <- k + 1;
- max_dist <- 0;
- }
- }
- }
- # Find bounds for the plot
- x_min <- min(Snake_x) - 0.05;
- x_max <- max(Snake_x) + 0.05;
- z_min <- min(Snake_z) - 0.05;
- z_max <- max(Snake_z) + 0.05;
- # Correct for grammar
- if (peak_count == 1) move_des = 'Move' else move_des = 'Moves';
- # If there is a change (peak count is not zero), output the location where the change occurs
- if (peak_count != 0){
- peak_loc <- array(0,dim=peak_count);
- peak_output <- "Move(s) at Pitch(es): ";
- for (i in 1:peak_count){
- peak_loc[i] <- loc_change[i] + ceiling(S/2);
- peak_output <- paste(peak_output,(loc_change[i] + ceiling(S/2)),sep="");
- if (i < peak_count){
- peak_output <- paste(peak_output,", ",sep="");
- }
- }
- print(peak_output);
- }
- # Plot the snakes, starting point (green), ending point (red), and changes (blue)
- windows();
- plot(Snake_x,Snake_z,type='n',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='Horizontal Location (Feet)',
- ylab='Vertical Location (Feet)',main=paste("",FIRST," ",LAST," 20",YEAR," ",H_or_A," at ",y_feet," Ft, ",S,"-Pitch Average, ",peak_count," ",move_des,"",sep=""),xaxs='i',yaxs='i');
- par(new=TRUE);
- lines(Snake_x,Snake_z,col='black',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='');
- par(new=TRUE);
- plot(Snake_x[1],Snake_z[1],col='green',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
- par(new=TRUE);
- plot(Snake_x[N_p-S+1],Snake_z[N_p-S+1],col='red',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
- if (peak_count != 0){
- for (i in 1:peak_count){
- par(new=TRUE);
- plot(Snake_x[loc_change[i]],Snake_z[loc_change[i]],col='blue',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
- }
- }
- par(new=FALSE);
- # Set up parameters for plotting with more than one release point location
- if (peak_count != 0){
- # The number of pitches in each cluster
- pitch_subcount <- array(0,dim=(peak_count+1));
- # The breaks between clusters
- peak_loc <- c(1,peak_loc,N_p+1);
- # Fill in the sub-count
- for (i in 1:(peak_count+1)){
- pitch_subcount[i] <- peak_loc[i+1] - peak_loc[i];
- }
- # Set up the colours for plotting the clusters (more colours can be added if needed)
- color_wheel <- c('red', 'blue', 'green', 'orange', 'black', 'purple', 'brown', 'pink', 'gray', 'goldenrod');
- }
- # Check if more than one cluster
- if (peak_count != 0){
- windows();
- plot(Snake_x,Snake_z,type='n',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='Horizontal Location (Feet)',
- ylab='Vertical Location (Feet)',main=paste("",FIRST," ",LAST," 20",YEAR," ",H_or_A," at ",y_feet," Ft, ",S,"-Pitch Average, ",peak_count," ",move_des,"",sep=""),
- xaxs='i',yaxs='i');
- # The breaks between clusters
- for (i in 1:(peak_count+1)){
- par(new=TRUE);
- num_elem <- max(peak_loc[i+1] - peak_loc[i] - S,1);
- Snake_x_temp <- array(0,dim=num_elem);
- Snake_z_temp <- array(0,dim=num_elem);
- avg_size <- min((S-1),(peak_loc[i+1] - peak_loc[i] - 1));
- for (j in peak_loc[i]:(peak_loc[i] + num_elem - 1)){
- Snake_x_temp[j-peak_loc[i]+1] <- mean(x_rel[c(j:(avg_size+j))]);
- Snake_z_temp[j-peak_loc[i]+1] <- mean(z_rel[c(j:(avg_size+j))]);
- }
- lines(Snake_x_temp,Snake_z_temp,col=color_wheel[i],xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='');
- par(new=TRUE);
- plot(Snake_x_temp[1],Snake_z_temp[1],col='green',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
- par(new=TRUE);
- plot(Snake_x_temp[num_elem],Snake_z_temp[num_elem],col='red',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
- par(new=TRUE);
- textxy(Snake_x_temp[1],Snake_z_temp[1],toString(i),col='black',cex=1,offset=1,m=c(x_min,z_min),lwd=2);
- par(new=TRUE);
- textxy(Snake_x_temp[num_elem],Snake_z_temp[num_elem],toString(i),col='black',cex=1,offset=1,m=c(x_min,z_min),lwd=2);
- }
- par(new=FALSE);
- }
- # If the number of peaks is not zero, process each cluster to find the release point
- if (peak_count != 0){
- windows();
- # Plot the clusters
- for(i in 1:(peak_count+1)){
- plot(x_rel[c(peak_loc[i]:(peak_loc[i+1]-1))],z_rel[c(peak_loc[i]:(peak_loc[i+1]-1))],col=color_wheel[i],
- xlim=c(x_rel_min,x_rel_max),ylim=c(z_rel_min,z_rel_max),pch=20,xlab = '', ylab='',xaxs='i',yaxs='i');
- par(new=TRUE);
- }
- title(main=paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", Pitches at ",y_feet," Ft, ",peak_count+1," Temporal Clusters",sep=""),xlab = 'Horizontal Location (Feet)',
- ylab = 'Vertical Location (Feet)');
- par(new=FALSE);
- # Compute results for all clusters
- for (i in 1:(peak_count+1)){
- # Set the indices for the points in the cluster
- C_Ind <- c(peak_loc[i]:(peak_loc[i+1]-1));
- RP_C <- list(x=RP$x[C_Ind],z=RP$z[C_Ind]);
- # Cluster the data to remove any outliers/extra clusters
- K <- inc_k_means_fast(RP_C,10^(-4),500,k_max);
- # Get the indices related to the largest cluster
- L_Ind <- Largest_Cluster(K,0.05);
- Release_C <- list( x0=Release$x0[C_Ind], y0=Release$y0[C_Ind], z0=Release$z0[C_Ind],
- vx0=Release$vx0[C_Ind], vy0=Release$vy0[C_Ind], vz0=Release$vz0[C_Ind],
- ax=Release$ax[C_Ind], ay=Release$ay[C_Ind], az=Release$az[C_Ind]);
- # Plot the clusters and output the results to the console
- SIM <- paste(", Cluster #",i,"",sep="");
- T_loc <- min_cluster(Release_C,L_Ind,y_feet,FIRST,LAST,YEAR,SIM,H_or_A);
- print(paste("Cluster #",i," Percentage: ",length(L_Ind),"/",length(C_Ind)," = ",round(100*length(L_Ind)/length(C_Ind),digits=1),"%",sep=""));
- print(paste("Cluster #",i," Distance: (",T_loc$RP_x,", ",T_loc$RP_y,", ",T_loc$RP_z,")",sep=""));
- print(paste("Cluster #",i," Angle: ",T_loc$RP_a,sep=""));
- print(paste("Cluster #",i," Area: ",round(T_loc$RP_area,digits=3),sep=""));
- }
- }
- # Compute the results for a single cluster
- if(peak_count == 0){
- # Cluster the data to remove any outliers/extra clusters
- K <- inc_k_means_fast(RP,10^(-4),500,k_max);
- # Get the indices related to the largest cluster
- L_Ind <- Largest_Cluster(K,0.05);
- SIM <- '';
- T_loc <- min_cluster(Release,L_Ind,y_feet,FIRST,LAST,YEAR,SIM,H_or_A);
- print(paste("Cluster Percentage: ",length(L_Ind),"/",N_p," = ",round(100*length(L_Ind)/N_p,digits=1),"%",sep=""));
- print(paste("Cluster Distance: (",T_loc$RP_x,", ",T_loc$RP_y,", ",T_loc$RP_z,")",sep=""));
- print(paste("Cluster Angle: ",T_loc$RP_a,sep=""));
- print(paste("Cluster Area: ",round(T_loc$RP_area,digits=3),sep=""));
- }
- ###################
- # RESET DIRECTORY #
- ###################
- setwd("~");
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement