Release Point Algorithm (Display Images)

###################################
# Release_Point_Combo_Home_Away.r #
###################################

# This program estimates the release point of a pitcher using PITCHf/x data.

# Written by Matthew Mata, May 2015 ([email protected])

#############
# LIBRARIES #
#############

# This library calls MySQL
library('RODBC');
library('calibrate');
library('diagram');

# Set the working directory and include the spatial clustering algorithm
setwd("R Scripts/Release Point Scripts");
source("inc_k_means_fast.r");

#############
# FUNCTIONS #
#############

# This function sets the x- and z-coordinates to a distance y
release_dist <- function(Pitch,y){
    # Define the arrays
    x0 <- Pitch$x0;
    y0 <- Pitch$y0;
    z0 <- Pitch$z0;
    vx0 <- Pitch$vx0;
    vy0 <- Pitch$vy0;
    vz0 <- Pitch$vz0;
    ax <- Pitch$ax;
    ay <- Pitch$ay;
    az <- Pitch$az;
    # Loop over the pitches and move the data for each pitch to distance y
    a <- 0.5*ay;
    b <- vy0;
    c <- y0 - y;
    t_rel <- (-b - sqrt(b^2 - 4*a*c))/(2*a);
    x <- 0.5*ax*t_rel^2 + vx0*t_rel + x0;
    z <- 0.5*az*t_rel^2 + vz0*t_rel + z0;
    # Store the new data as a list and return
    RP <- list(x = x, z = z);
    return(RP);
}

# This function calculates the location where the cluster is smallest
min_cluster <- function(Pitch,Ind,y_feet,FIRST,LAST,YEAR,SIM,H_or_A){
    x0 <- Pitch$x0[Ind];
    y0 <- Pitch$y0[Ind];
    z0 <- Pitch$z0[Ind];
    vx0 <- Pitch$vx0[Ind];
    vy0 <- Pitch$vy0[Ind];
    vz0 <- Pitch$vz0[Ind];
    ax <- Pitch$ax[Ind];
    ay <- Pitch$ay[Ind];
    az <- Pitch$az[Ind];
    N_p <- length(x0);
    # Set the number of nodes (45 feet to 65 feet in inches)
    N_y <- 241;
    # Initialize the arrays for storing the data at different distances, sample means and covariances, and areas
    R_x <- array(0,dim=c(N_y,N_p));
    R_y <- array(0,dim=c(N_y,N_p));
    R_z <- array(0,dim=c(N_y,N_p));
    mu <- array(0,dim=c(N_y,2));
    sig <- array(0,dim=c(N_y,2,2));
    mu_temp <- array(0,dim=2);
    sig_temp <- array(0,dim=c(2,2));
    ellipse_area <- array(0,dim=N_y);
    # Set the interval of where the release point will occur
    D_y <- seq(45,65,length=N_y);
    a <- 0.5*ay;
    b <- vy0;
    # Loop over the distances
    for (i in 1:N_y){
        # Set the coefficients for use in the Pythagorean theorem to solve a t^2 + b t + c = (Distance in y) for t
        c <- y0 - D_y[i];
        t <- (-b - sqrt(b^2 - 4*a*c))/(2*a);
        # Evaluate the distance at t
        R_x[i,] <- t(0.5*ax*t^2 + vx0*t + x0);
        R_y[i,] <- t(0.5*ay*t^2 + vy0*t + y0);
        R_z[i,] <- t(0.5*az*t^2 + vz0*t + z0);
        # Compute the sample mean and sample covariance
        mu_temp <- array(0,dim=2);
        sig_temp <- array(0,dim=c(2,2));
        mu_temp[1] <- sum(R_x[i,])/N_p;
        mu_temp[2] <- sum(R_z[i,])/N_p;
        sig_temp[1,1] <- sum((R_x[i,] - mu_temp[1])^2);
        sig_temp[1,2] <- sum((R_x[i,] - mu_temp[1])*(R_z[i,] - mu_temp[2]));
        sig_temp[2,1] <- sig_temp[1,2];
        sig_temp[2,2] <- sum((R_z[i,] - mu_temp[2])^2);
        sig_temp <- sig_temp/(N_p-1);
        mu[i,] <- mu_temp;
        sig[i,,] <- sig_temp;
        # Compute the eigenvalues and eigenvectors of the covariance matrix
        ellipse <- eigen(sig[i,,]);
        evalues <- ellipse$values;
        evectors <- ellipse$vectors;
        # Determine two standard deviations along the axes
        ellipse_axes <- 2*sqrt(evalues);
        # Compute the area of the ellipse
        ellipse_area[i] <- pi*ellipse_axes[1]*ellipse_axes[2];
    }
    # Find the index corresponding to the smallest area
    RP_index <- which(ellipse_area == min(ellipse_area));
    # Set up the data for ellipse plotting
    s <- seq(0,2*pi,length=200);
    x_par <- cos(s);
    y_par <- sin(s);
    # Find the minimum ellipse
    ellipse <- eigen(sig[RP_index,,]);
    # Determine the values for plotting the ellipse
    evalues <- ellipse$values;
    evectors <- ellipse$vectors;
    ellipse_axes <- 2*sqrt(evalues);
    x_ellipse <- ellipse_axes[1]*x_par;
    y_ellipse <- ellipse_axes[2]*y_par;
    ellipse_matrix <- rbind(x_ellipse,y_ellipse);
    ellipse_coord <- evectors%*%ellipse_matrix + mu[RP_index,];
    # Set bounds for the plot
    min_x <- min(R_x[RP_index,])-0.15;
    max_x <- max(R_x[RP_index,])+0.15;
    min_y <- min(R_z[RP_index,])-0.15;
    max_y <- max(R_z[RP_index,])+0.15;
    # Compute the release angle
    angle <- release_angle(ellipse);
    # Set the index for where the clustering occurs
    cluster_ind <- round(12*(y_feet-45) + 1,digits=0);
    # Determine the release point
    R_Point <- round(D_y[RP_index],digits=3);
    R_Point_h <- round(mu[RP_index,1],digits=3);
    R_Point_v <- round(mu[RP_index,2],digits=3);
    windows();
    # Plot the functional for the area of the ellipses
    plot(D_y,ellipse_area,type='n',xlab='Distance From Back of Home Plate (Feet)',ylab='Two Standard Deviation Ellipse Area (Square Feet)',
         main = paste("Release Point Functional for ",FIRST," ",LAST," 20",YEAR," ",H_or_A,SIM,"",sep=""),xaxs='i',yaxs='i');
    par(new=TRUE);
    lines(D_y,ellipse_area);
    par(new=FALSE);
    windows();
    # Plot the clustered data at the distance it was clustered
    plot(R_x[cluster_ind,],R_z[cluster_ind,],col="blue",pch=20,xlim=c(min_x,max_x),ylim=c(min_y,max_y),xlab='Horizontal Location (Feet)',
         ylab='Vertical Location (Feet)', main = paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", Pitches at ",y_feet," Ft",SIM,"",sep=""),xaxs='i',yaxs='i');
    windows();
    # Plot the cluster at the estimated release point
    plot(R_x[RP_index,],R_z[RP_index,],col="blue",pch=20,xlim=c(min_x,max_x),ylim=c(min_y,max_y),xlab='Horizontal Location (Feet)',
         ylab='Vertical Location (Feet)', main = paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", Est. Release Point, ",R_Point," Ft",SIM,"",sep=""),xaxs='i',yaxs='i');
    par(new=TRUE);
    lines(ellipse_coord[1,],ellipse_coord[2,],col='black',xlim=c(min_x,max_x),ylim=c(min_y,max_y),xlab='',ylab='',lwd=2);
    par(new=FALSE);
    # Return the release point and angle as a list
    RP_data <- list( RP_x = R_Point_h, RP_y = R_Point, RP_z = R_Point_v, RP_a = angle, RP_area = ellipse_area[RP_index] );
    return(RP_data);
}

# This function determines the release angle
release_angle <- function(E){
    # Calculate the eigenvalues and eigenvectors
    evals <- E$values;
    evecs <- E$vectors;
    # Choose the smaller of the two eigenvalues and use the geometric definition of the dot product to compute the angle from horizontal
    # This means dotting the corresponding eigenvalue with (1,0), taking the absolute value, and computing the arccosine.
    if (evals[1] < evals[2]){
        angle <- acos(abs(evecs[1,1]));
    }
    else{
        angle <- acos(abs(evecs[1,2]));
    }
    # Return the angle
    angle <- round((180/pi)*angle,digits=3);
    return(angle);
}

# This function determines the indices of the data that has a 'pct' percent chance or better
# of belonging to the primary (largest) cluster.

Largest_Cluster <- function(K,pct){
    # Determine the primary cluster
    Primary <- which(K$w == max(K$w));
    # Find the points with 5% or better probability of being from the primary cluster
    Lrg <- which(K$C[Primary,] >= pct);
    return(Lrg);
}

########
# MAIN #
########

# Set the player name (PARAMETER)
FIRST <- 'Corey';
LAST <- 'Kluber';
YEAR <- 13;

DB <- paste("MLB20",YEAR,sep="");

# Set the release distance for clustering (PARAMETER)
y_feet <- 55;

# Number of points to average over for the rolling average (PARAMETER)
S <- 75;

# Set the distance in feet for a change in release location (PARAMETER)
marker <- 0.75;

# Set the maximal number of spatial clusters (PARAMETER)
k_max <- 10;

# Set HA to "1" for Home and "2" for Away data
HA <- 1;

if (HA == 1) {H_or_A <- '(H)'} else {H_or_A <- '(A)'};

print(paste("Pitcher: ",FIRST," ",LAST,", 20",YEAR," " ,H_or_A,"",sep=""));

# Set the channel (Database is MLB2014)
channel <- odbcConnect(DB);

# Set the query to get the PITCHf/x data and exclude any missing entries (by finding entries missing the px value)
# Also exclude intentional balls
Release_Query <- paste("SELECT x0, y0, z0, vx0, vy0, vz0, ax, ay, az FROM pitches WHERE
                       ab_id IN (SELECT ab_id FROM atbats WHERE
                       pitcher = (SELECT eliasid FROM players WHERE first = '",
                       FIRST,"' AND last = '",LAST,"') AND half = ",HA,") AND
                       pitch_type <> 'IN' AND px IS NOT NULL",sep="");

# Download the pitch data
Release <- sqlQuery(channel,Release_Query);

# Close the channel
close(channel);

# Move the data to y_feet and assign the values of x and z to arrays
RP <- release_dist(Release,y_feet);
x_rel <- RP$x;
z_rel <- RP$z;
N_p <- length(x_rel);

# Set bounds for plotting
x_rel_min <- min(x_rel) - 0.05;
x_rel_max <- max(x_rel) + 0.05;
z_rel_min <- min(z_rel) - 0.05;
z_rel_max <- max(z_rel) + 0.05;

# Plot unclustered data
windows();
plot(x_rel,z_rel,col="blue",pch=20,xlim=c(x_rel_min,x_rel_max),ylim=c(z_rel_min,z_rel_max),xlab='Horizontal Location (Feet)',
         ylab='Vertical Location (Feet)', main = paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", All Pitches at 55 Ft",sep=""),xaxs='i',yaxs='i');

# Arrays for storing the rolling averages (referred to in the code as "Snake")
Snake_x <- array(0,dim=(N_p-S+1));
Snake_z <- array(0,dim=(N_p-S+1));

# Compute the rolling averages
for (i in 1:(N_p-S+1)){
    Snake_x[i] <- mean(x_rel[c(i:(S-1+i))]);
    Snake_z[i] <- mean(z_rel[c(i:(S-1+i))]);
}

# Calculate the line integral or length of the snake
line_int <- 0;
for (i in 1:(N_p-S)){
    line_int <- line_int + sqrt((Snake_x[i+1]-Snake_x[i])^2 + (Snake_z[i+1]-Snake_z[i])^2);
}

# Print the length of the line
print(paste("Length of Line: ",round(line_int,digits=3),sep=""));

# Initialize an array for storing the distance between points in the rolling average
sep_dist <- array(0,dim=(N_p-2*S+1));

# Calculate the distance between disjoint, adjacent rolling averages
for (i in 1:(N_p-2*S+1)){
        sep_dist[i] <- sqrt((Snake_x[i] - Snake_x[i+S])^2 + (Snake_z[i] - Snake_z[i+S])^2) - marker;
}

# Initialize the number for counting the location changes
peak_count <- 0;

# Count the number of peaks as the number of times the distance exceeds the 'marker' for a change
for (i in 1:(N_p-2*S)){
    if ( sep_dist[i] < 0 && sep_dist[i+1] > 0 ){
        peak_count <- peak_count + 1;
    }
}

# Find the location of the maximum distance of the separations
if ( peak_count != 0){
    loc_change <- array(0,dim=peak_count);
    k <- 1;
    max_dist <- 0;
    for (i in 1:(N_p-2*S+1)){
        if ( sep_dist[i] > max_dist ){
            loc_change[k] <- i + floor(S/2);
            max_dist <- sep_dist[i];
        }
        if ( sep_dist[i] > 0 && sep_dist[i+1] < 0 ){
            k <- k + 1;
            max_dist <- 0;
        }
    }
}

# Find bounds for the plot
x_min <- min(Snake_x) - 0.05;
x_max <- max(Snake_x) + 0.05;
z_min <- min(Snake_z) - 0.05;
z_max <- max(Snake_z) + 0.05;

# Correct for grammar
if (peak_count == 1) move_des = 'Move' else move_des = 'Moves';

# If there is a change (peak count is not zero), output the location where the change occurs
if (peak_count != 0){
    peak_loc <- array(0,dim=peak_count);
    peak_output <- "Move(s) at Pitch(es): ";
    for (i in 1:peak_count){
        peak_loc[i] <- loc_change[i] + ceiling(S/2);
        peak_output <- paste(peak_output,(loc_change[i] + ceiling(S/2)),sep="");
        if (i < peak_count){
            peak_output <- paste(peak_output,", ",sep="");
        }
    }
    print(peak_output);
}

# Plot the snakes, starting point (green), ending point (red), and changes (blue)
windows();
plot(Snake_x,Snake_z,type='n',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='Horizontal Location (Feet)',
     ylab='Vertical Location (Feet)',main=paste("",FIRST," ",LAST," 20",YEAR," ",H_or_A," at ",y_feet," Ft, ",S,"-Pitch Average, ",peak_count," ",move_des,"",sep=""),xaxs='i',yaxs='i');
par(new=TRUE);
lines(Snake_x,Snake_z,col='black',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='');
par(new=TRUE);
plot(Snake_x[1],Snake_z[1],col='green',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
par(new=TRUE);
plot(Snake_x[N_p-S+1],Snake_z[N_p-S+1],col='red',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
if (peak_count != 0){
    for (i in 1:peak_count){
        par(new=TRUE);
        plot(Snake_x[loc_change[i]],Snake_z[loc_change[i]],col='blue',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
    }
}
par(new=FALSE);

# Set up parameters for plotting with more than one release point location
if (peak_count != 0){
    # The number of pitches in each cluster
    pitch_subcount <- array(0,dim=(peak_count+1));
    # The breaks between clusters
    peak_loc <- c(1,peak_loc,N_p+1);
    # Fill in the sub-count
    for (i in 1:(peak_count+1)){
        pitch_subcount[i] <- peak_loc[i+1] - peak_loc[i];
    }
    # Set up the colours for plotting the clusters (more colours can be added if needed)
    color_wheel <- c('red', 'blue', 'green', 'orange', 'black', 'purple', 'brown', 'pink', 'gray', 'goldenrod');
}

# Check if more than one cluster
if (peak_count != 0){
    windows();
    plot(Snake_x,Snake_z,type='n',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='Horizontal Location (Feet)',
         ylab='Vertical Location (Feet)',main=paste("",FIRST," ",LAST," 20",YEAR," ",H_or_A," at ",y_feet," Ft, ",S,"-Pitch Average, ",peak_count," ",move_des,"",sep=""),
         xaxs='i',yaxs='i');
    # The breaks between clusters
    for (i in 1:(peak_count+1)){
        par(new=TRUE);
        num_elem <- max(peak_loc[i+1] - peak_loc[i] - S,1);
        Snake_x_temp <- array(0,dim=num_elem);
        Snake_z_temp <- array(0,dim=num_elem);
        avg_size <- min((S-1),(peak_loc[i+1] - peak_loc[i] - 1));
        for (j in peak_loc[i]:(peak_loc[i] + num_elem - 1)){
            Snake_x_temp[j-peak_loc[i]+1] <- mean(x_rel[c(j:(avg_size+j))]);
            Snake_z_temp[j-peak_loc[i]+1] <- mean(z_rel[c(j:(avg_size+j))]);
        }
        lines(Snake_x_temp,Snake_z_temp,col=color_wheel[i],xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='');
        par(new=TRUE);
        plot(Snake_x_temp[1],Snake_z_temp[1],col='green',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
        par(new=TRUE);
        plot(Snake_x_temp[num_elem],Snake_z_temp[num_elem],col='red',xlim=c(x_min,x_max),ylim=c(z_min,z_max),xlab='',ylab='',pch=15,xaxs='i',yaxs='i');
        par(new=TRUE);
        textxy(Snake_x_temp[1],Snake_z_temp[1],toString(i),col='black',cex=1,offset=1,m=c(x_min,z_min),lwd=2);
        par(new=TRUE);
        textxy(Snake_x_temp[num_elem],Snake_z_temp[num_elem],toString(i),col='black',cex=1,offset=1,m=c(x_min,z_min),lwd=2);
    }
    par(new=FALSE);
}

# If the number of peaks is not zero, process each cluster to find the release point
if (peak_count != 0){
    windows();
    # Plot the clusters
    for(i in 1:(peak_count+1)){
        plot(x_rel[c(peak_loc[i]:(peak_loc[i+1]-1))],z_rel[c(peak_loc[i]:(peak_loc[i+1]-1))],col=color_wheel[i],
             xlim=c(x_rel_min,x_rel_max),ylim=c(z_rel_min,z_rel_max),pch=20,xlab = '', ylab='',xaxs='i',yaxs='i');
             par(new=TRUE);
    }
    title(main=paste(FIRST," ",LAST," 20",YEAR," ",H_or_A,", Pitches at ",y_feet," Ft, ",peak_count+1," Temporal Clusters",sep=""),xlab = 'Horizontal Location (Feet)',
          ylab = 'Vertical Location (Feet)');
    par(new=FALSE);
    # Compute results for all clusters
    for (i in 1:(peak_count+1)){
        # Set the indices for the points in the cluster
        C_Ind <- c(peak_loc[i]:(peak_loc[i+1]-1));
        RP_C <- list(x=RP$x[C_Ind],z=RP$z[C_Ind]);
        # Cluster the data to remove any outliers/extra clusters
        K <- inc_k_means_fast(RP_C,10^(-4),500,k_max);
        # Get the indices related to the largest cluster
        L_Ind <- Largest_Cluster(K,0.05);
        Release_C <- list( x0=Release$x0[C_Ind], y0=Release$y0[C_Ind], z0=Release$z0[C_Ind],
                           vx0=Release$vx0[C_Ind], vy0=Release$vy0[C_Ind], vz0=Release$vz0[C_Ind],
                           ax=Release$ax[C_Ind], ay=Release$ay[C_Ind], az=Release$az[C_Ind]);
        # Plot the clusters and output the results to the console
        SIM <- paste(", Cluster #",i,"",sep="");
        T_loc <- min_cluster(Release_C,L_Ind,y_feet,FIRST,LAST,YEAR,SIM,H_or_A);
        print(paste("Cluster #",i," Percentage: ",length(L_Ind),"/",length(C_Ind)," = ",round(100*length(L_Ind)/length(C_Ind),digits=1),"%",sep=""));
        print(paste("Cluster #",i," Distance: (",T_loc$RP_x,", ",T_loc$RP_y,", ",T_loc$RP_z,")",sep=""));
        print(paste("Cluster #",i," Angle: ",T_loc$RP_a,sep=""));
        print(paste("Cluster #",i," Area: ",round(T_loc$RP_area,digits=3),sep=""));
    }
}

# Compute the results for a single cluster
if(peak_count == 0){
    # Cluster the data to remove any outliers/extra clusters
    K <- inc_k_means_fast(RP,10^(-4),500,k_max);
    # Get the indices related to the largest cluster
    L_Ind <- Largest_Cluster(K,0.05);
    SIM <- '';
    T_loc <- min_cluster(Release,L_Ind,y_feet,FIRST,LAST,YEAR,SIM,H_or_A);
    print(paste("Cluster Percentage: ",length(L_Ind),"/",N_p," = ",round(100*length(L_Ind)/N_p,digits=1),"%",sep=""));
    print(paste("Cluster Distance: (",T_loc$RP_x,", ",T_loc$RP_y,", ",T_loc$RP_z,")",sep=""));
    print(paste("Cluster Angle: ",T_loc$RP_a,sep=""));
    print(paste("Cluster Area: ",round(T_loc$RP_area,digits=3),sep=""));
}

###################
# RESET DIRECTORY #
###################

setwd("~");