Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
##########################################################################################
# inc_k_means_fast.r: This routine performs the incremental k-means clustering algorithm #
# based on "The Estimating Optimal Number of Gaussian Mixtures on Incremental k-Means    #
# for Speaker Identification" by Younjeong Lee, Ki Yong Lee, and Joohun Lee (2006).      #
##########################################################################################
# Code written and algorithm modified by Matthew Mata, May 2015 (matthewrmata@gmail.com)
# This version is restricted to two dimensions to allow for optimization of the runtime.

#############
# LIBRARIES #
#############
- library(abind);
- ###############################################################
- # MAIN FUNCTION DECLARATION: INCREMENTAL K-MEANS EM ALGORITHM #
- ###############################################################
- # 'Pitches' is a list which has the 'x' and 'z' coordinates of the PITCHf/x data
- # 'tol' is the log-likelihood tolerance (e.g., 10^(-10))
- # 'iter_max' is the maximum number of iterations for the EM algorithm (e.g., 1000)
- # 'k_max' is the maximum number of clusters to look for (e.g., 10)
- # The output is the number of clusters, M, probabilities, w, means, mu, and covariances, sig
- inc_k_means_fast <- function(Pitches,tol,iter_max,k_max){
- # Assign the x- and z-data to vectors
- x_coord <- Pitches$x;
- z_coord <- Pitches$z;
- # Find the number of pitches
- T <- length(x_coord);
- # Build the array
- X <- array(0,dim=c(T,2));
- X[,1] <- x_coord;
- X[,2] <- z_coord;
- # Initialize the count of mixtures
- k <- 1;
- # Set up empty blocks for mu and SIG
- w_block <- array(1,dim=c(1,1));
- mu_block <- array(0,c(1,2));
- SIG_block_temp <- diag(2);
- SIG_block <- array(0,c(2,2,1));
- SIG_block[,,1] <- SIG_block_temp;
- C_block <- array(1,dim=c(1,T));
- phi_block <- array(0,dim=1);
- # Allocate an array for the possible centroids and means
- E <- array(0,dim=T);
- w <- w_block;
- mu <- mu_block;
- SIG <- SIG_block;
- C <- C_block;
- phi <- phi_block;
- # Calculate the first centroid
- for (n in 1:T) {
- E[n] <- sum(exp(-rowSums(sweep(X,2,X[n,],"-")^2)));
- }
- # Find the index corresponding to the minimum of E
- n_star <- which(E == max(E));
- # Assign the centroid to mu
- mu[1,] = X[n_star,];
- # Compute the covariance matrix
- X_temp <- sweep(X,2,mu[1,],"-")
- SIG[,,1] <- t(X_temp)%*%X_temp;
- SIG[,,1] <- SIG[,,1]/(T-1);
- # Save out the data to new arrays
- while(max(phi[c(1:(k-1))]) <= 0 && k < k_max){
- # Update the parameters
- w_old <- w;
- mu_old <- mu;
- SIG_old <- SIG;
- C_old <- C;
- # Add a new cluster
- k <- k+1;
- # Add an extra column to the w, mu, SIG, C, phi, and total_temp
- w <- cbind(w,w_block);
- mu <- rbind(mu,mu_block);
- SIG <- abind(SIG,SIG_block);
- C <- rbind(C,C_block);
- phi <- rbind(phi,phi_block);
- # Zero out E
- E <- 0;
- Total_Array <- array(0,dim=c(T,k-1));
- # Calculate the next centroid
- for(m in 1:(k-1)){
- Total_Array[,m] <- rowSums(sweep(X,2,mu[m,],"-")^2);
- }
- for(n in 1:T){
- total <- 0;
- for(t in 1:T){
- total <- total + min(c(Total_Array[t,],sum((X[t,] - X[n,])^2)));
- }
- E[n] <- total;
- }
- n_star <- which(E == min(E));
- mu[k,] = X[n_star,];
- # To start EM, compute the initial log likelihood
- Q_old <- 0;
- Q_new <- 0;
- # Set the iterations counter
- iter <- 0;
- # Use a while loop to determine when to end EM
- while( (abs(Q_new-Q_old) > tol && iter < iter_max) || iter == 0 ){
- # Expectation Step
- C <- E_Step(X,w,mu,SIG,k,T);
- # Maximization Step
- w <- M_Step_Mix(C,T);
- SIG <- M_Step_Var(X,mu,C,k);
- # Compute the log likelihood and update the Q-values
- Q_old <- Q_new;
- Q_new <- Q_theta(X,w,mu,SIG,C,k);
- iter <- iter + 1;
- }
- # Calculate the probabilities for testing independence of clusters
- phi <- phi_ik(C,k,T);
- }
- # If the max iterations is reached, output the values. Otherwise, output the results when the algorithm ends
- if(k == k_max){
- # Turn the output into a list
- Out <- list("M" = k, "w" = w, "mu" = mu, "SIG" = SIG, "C" = C);
- }
- else{
- # Set M, the number of clusters, equal to k-1
- M = k-1;
- # Turn the output into a list
- Out <- list("M" = M, "w" = w_old, "mu" = mu_old, "SIG" = SIG_old, "C" = C_old);
- }
- return(Out);
- }
- ########################
- # ADDITIONAL FUNCTIONS #
- ########################
# Expectation step of EM: posterior responsibility of each of the K
# bivariate-Gaussian components for each of the T observations.
# X: T x 2 data matrix; w: mixture weights; mu: K x 2 means;
# SIG: 2 x 2 x K covariance stack.
# Returns a K x T matrix whose columns each sum to one.
E_Step <- function(X,w,mu,SIG,K,T){
  resp <- array(0, dim = c(K, T));
  for (comp in seq_len(K)) {
    # Center the data on this component's mean
    centered <- sweep(X, 2, mu[comp, ], "-");
    # Squared Mahalanobis distance of every point, via a linear solve
    mahal <- rowSums(centered * t(solve(SIG[, , comp], t(centered))));
    # Weighted bivariate normal density (2-D normalizing constant)
    norm_const <- w[comp] / ((2 * pi) * sqrt(det(SIG[, , comp])));
    resp[comp, ] <- norm_const * exp(-0.5 * mahal);
  }
  # Normalize each column so responsibilities sum to one per observation
  sweep(resp, 2, colSums(resp), "/");
}
# Expected complete-data log likelihood of the K-component Gaussian
# mixture, weighted by the responsibilities C (K x T).
# Returns the scalar Q value (as a 1 x 1 matrix, matching %*%).
Q_theta <- function(X,w,mu,SIG,C,K){
  log_lik <- 0;
  for (comp in seq_len(K)) {
    # Center the data on this component's mean
    centered <- sweep(X, 2, mu[comp, ], "-");
    # Squared Mahalanobis distances via a linear solve
    mahal <- rowSums(centered * t(solve(SIG[, , comp], t(centered))));
    # Per-point log density plus log weight for this component
    per_point <- log(w[comp]) - 0.5 * log(det(SIG[, , comp])) -
      0.5 * mahal - log(2 * pi);
    # Accumulate, weighting each point by its responsibility
    log_lik <- log_lik + C[comp, ] %*% per_point;
  }
  return(log_lik);
}
# Maximization step for the mixture weights: each component's weight is
# its average responsibility across all T observations.
# C is K x T; returns a 1 x K row vector of weights.
M_Step_Mix <- function(C,T){
  t(rowSums(C)) / T;
}
# Maximization step for the covariances: responsibility-weighted outer
# products of the centered data, normalized by each component's total
# responsibility.  X is T x 2, mu is K x 2, C is K x T.
# Returns a 2 x 2 x K covariance stack.
M_Step_Var <- function(X,mu,C,K){
  updated <- array(0, dim = c(2, 2, K));
  # Total responsibility per component (normalizers)
  total_resp <- rowSums(C);
  for (comp in seq_len(K)) {
    centered <- sweep(X, 2, mu[comp, ], "-");
    # Scale each row (observation) by its responsibility for this component
    weighted <- sweep(centered, 1, C[comp, ], "*");
    updated[, , comp] <- (t(weighted) %*% centered) / total_resp[comp];
  }
  return(updated);
}
# Mutual-information-style score between each component and component K,
# computed from the responsibilities C (K x T), in bits.
# A positive score indicates dependence between component i and K.
phi_ik <- function(C,K,T){
  # Marginal: average responsibility of each component
  marg <- rowSums(C) / T;
  # Joint: average product of responsibilities with component K
  joint <- rowSums(sweep(C, 2, C[K, ], "*")) / T;
  # Pointwise mutual information, converted from nats to bits
  joint * log(joint / (marg * marg[K])) / log(2);
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement