Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
##########################################################################################
# inc_k_means_fast.r: This routine performs the incremental k-means clustering algorithm #
# based on "The Estimating Optimal Number of Gaussian Mixtures on Incremental k-Means    #
# for Speaker Identification" by Younjeong Lee, Ki Yong Lee, and Joohun Lee (2006).      #
##########################################################################################
# Code written and algorithm modified by Matthew Mata, May 2015 (matthewrmata@gmail.com)
# This version is restricted to two dimensions to allow for optimization of the runtime.

#############
# LIBRARIES #
#############
- library(abind);
- ###############################################################
- # MAIN FUNCTION DECLARATION: INCREMENTAL K-MEANS EM ALGORITHM #
- ###############################################################
- # 'Pitches' is a list which has the 'x' and 'z' coordinates of the PITCHf/x data
- # 'tol' is the log-likelihood tolerance (e.g., 10^(-10))
- # 'iter_max' is the maximum number of iterations for the EM algorithm (e.g., 1000)
- # 'k_max' is the maximum number of clusters to look for (e.g., 10)
- # The output is the number of clusters, M, probabilities, w, means, mu, and covariances, sig
- inc_k_means_fast <- function(Pitches,tol,iter_max,k_max){
- # Assign the x- and z-data to vectors
- x_coord <- Pitches$x;
- z_coord <- Pitches$z;
- # Find the number of pitches
- T <- length(x_coord);
- # Build the array
- X <- array(0,dim=c(T,2));
- X[,1] <- x_coord;
- X[,2] <- z_coord;
- # Initialize the count of mixtures
- k <- 1;
- # Set up empty blocks for mu and SIG
- w_block <- array(1,dim=c(1,1));
- mu_block <- array(0,c(1,2));
- SIG_block_temp <- diag(2);
- SIG_block <- array(0,c(2,2,1));
- SIG_block[,,1] <- SIG_block_temp;
- C_block <- array(1,dim=c(1,T));
- phi_block <- array(0,dim=1);
- # Allocate an array for the possible centroids and means
- E <- array(0,dim=T);
- w <- w_block;
- mu <- mu_block;
- SIG <- SIG_block;
- C <- C_block;
- phi <- phi_block;
- # Calculate the first centroid
- for (n in 1:T) {
- E[n] <- sum(exp(-rowSums(sweep(X,2,X[n,],"-")^2)));
- }
- # Find the index corresponding to the minimum of E
- n_star <- which(E == max(E));
- # Assign the centroid to mu
- mu[1,] = X[n_star,];
- # Compute the covariance matrix
- X_temp <- sweep(X,2,mu[1,],"-")
- SIG[,,1] <- t(X_temp)%*%X_temp;
- SIG[,,1] <- SIG[,,1]/(T-1);
- # Save out the data to new arrays
- while(max(phi[c(1:(k-1))]) <= 0 && k < k_max){
- # Update the parameters
- w_old <- w;
- mu_old <- mu;
- SIG_old <- SIG;
- C_old <- C;
- # Add a new cluster
- k <- k+1;
- # Add an extra column to the w, mu, SIG, C, phi, and total_temp
- w <- cbind(w,w_block);
- mu <- rbind(mu,mu_block);
- SIG <- abind(SIG,SIG_block);
- C <- rbind(C,C_block);
- phi <- rbind(phi,phi_block);
- # Zero out E
- E <- 0;
- Total_Array <- array(0,dim=c(T,k-1));
- # Calculate the next centroid
- for(m in 1:(k-1)){
- Total_Array[,m] <- rowSums(sweep(X,2,mu[m,],"-")^2);
- }
- for(n in 1:T){
- total <- 0;
- for(t in 1:T){
- total <- total + min(c(Total_Array[t,],sum((X[t,] - X[n,])^2)));
- }
- E[n] <- total;
- }
- n_star <- which(E == min(E));
- mu[k,] = X[n_star,];
- # To start EM, compute the initial log likelihood
- Q_old <- 0;
- Q_new <- 0;
- # Set the iterations counter
- iter <- 0;
- # Use a while loop to determine when to end EM
- while( (abs(Q_new-Q_old) > tol && iter < iter_max) || iter == 0 ){
- # Expectation Step
- C <- E_Step(X,w,mu,SIG,k,T);
- # Maximization Step
- w <- M_Step_Mix(C,T);
- SIG <- M_Step_Var(X,mu,C,k);
- # Compute the log likelihood and update the Q-values
- Q_old <- Q_new;
- Q_new <- Q_theta(X,w,mu,SIG,C,k);
- iter <- iter + 1;
- }
- # Calculate the probabilities for testing independence of clusters
- phi <- phi_ik(C,k,T);
- }
- # If the max iterations is reached, output the values. Otherwise, output the results when the algorithm ends
- if(k == k_max){
- # Turn the output into a list
- Out <- list("M" = k, "w" = w, "mu" = mu, "SIG" = SIG, "C" = C);
- }
- else{
- # Set M, the number of clusters, equal to k-1
- M = k-1;
- # Turn the output into a list
- Out <- list("M" = M, "w" = w_old, "mu" = mu_old, "SIG" = SIG_old, "C" = C_old);
- }
- return(Out);
- }
- ########################
- # ADDITIONAL FUNCTIONS #
- ########################
# Expectation step of EM: posterior responsibility of each of the K
# bivariate-Gaussian components for each of the T observations.
# X: T x 2 data matrix; w: mixture weights; mu: K x 2 means;
# SIG: 2 x 2 x K covariance stack.
# Returns a K x T matrix whose columns each sum to one.
E_Step <- function(X,w,mu,SIG,K,T){
  resp <- array(0, dim = c(K, T));
  for (comp in seq_len(K)) {
    # Center the data on this component's mean
    centered <- sweep(X, 2, mu[comp, ], "-");
    # Squared Mahalanobis distance of every point, via a linear solve
    mahal <- rowSums(centered * t(solve(SIG[, , comp], t(centered))));
    # Weighted bivariate normal density (2-D normalizing constant)
    norm_const <- w[comp] / ((2 * pi) * sqrt(det(SIG[, , comp])));
    resp[comp, ] <- norm_const * exp(-0.5 * mahal);
  }
  # Normalize each column so responsibilities sum to one per observation
  sweep(resp, 2, colSums(resp), "/");
}
# Expected complete-data log likelihood of the K-component Gaussian
# mixture, weighted by the responsibilities C (K x T).
# Returns the scalar Q value (as a 1 x 1 matrix, matching %*%).
Q_theta <- function(X,w,mu,SIG,C,K){
  log_lik <- 0;
  for (comp in seq_len(K)) {
    # Center the data on this component's mean
    centered <- sweep(X, 2, mu[comp, ], "-");
    # Squared Mahalanobis distances via a linear solve
    mahal <- rowSums(centered * t(solve(SIG[, , comp], t(centered))));
    # Per-point log density plus log weight for this component
    per_point <- log(w[comp]) - 0.5 * log(det(SIG[, , comp])) -
      0.5 * mahal - log(2 * pi);
    # Accumulate, weighting each point by its responsibility
    log_lik <- log_lik + C[comp, ] %*% per_point;
  }
  return(log_lik);
}
# Maximization step for the mixture weights: each component's weight is
# its average responsibility across all T observations.
# C is K x T; returns a 1 x K row vector of weights.
M_Step_Mix <- function(C,T){
  t(rowSums(C)) / T;
}
# Maximization step for the covariances: responsibility-weighted outer
# products of the centered data, normalized by each component's total
# responsibility.  X is T x 2, mu is K x 2, C is K x T.
# Returns a 2 x 2 x K covariance stack.
M_Step_Var <- function(X,mu,C,K){
  updated <- array(0, dim = c(2, 2, K));
  # Total responsibility per component (normalizers)
  total_resp <- rowSums(C);
  for (comp in seq_len(K)) {
    centered <- sweep(X, 2, mu[comp, ], "-");
    # Scale each row (observation) by its responsibility for this component
    weighted <- sweep(centered, 1, C[comp, ], "*");
    updated[, , comp] <- (t(weighted) %*% centered) / total_resp[comp];
  }
  return(updated);
}
# Mutual-information-style score between each component and component K,
# computed from the responsibilities C (K x T), in bits.
# A positive score indicates dependence between component i and K.
phi_ik <- function(C,K,T){
  # Marginal: average responsibility of each component
  marg <- rowSums(C) / T;
  # Joint: average product of responsibilities with component K
  joint <- rowSums(sweep(C, 2, C[K, ], "*")) / T;
  # Pointwise mutual information, converted from nats to bits
  joint * log(joint / (marg * marg[K])) / log(2);
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement