Advertisement
matthewrmata

2D Clustering Algorithm

Jun 3rd, 2015
220
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 5.56 KB | None | 0 0
  1. ##########################################################################################
  2. # inc_k_means_fast.r: This routine performs the incremental k-means clustering algorithm #
  3. #                     based on "The Estimating Optimal Number of Gaussian Mixtures on    #
  4. #                     Incremental k-Means for Speaker Identification" by Younjeong Lee,  #
  5. #                     Ki Yong Lee, and Joohun Lee (2006).                                #
  6. ##########################################################################################
  7.  
  8. # Code written and algorithm modified by Matthew Mata, May 2015 (matthewrmata@gmail.com)
  9.  
  10. # This version is restricted to two dimensions to allow for optimization of the runtime.
  11.  
  12. #############
  13. # LIBRARIES #
  14. #############
  15.  
  16. library(abind);
  17.  
  18. ###############################################################
  19. # MAIN FUNCTION DECLARATION: INCREMENTAL K-MEANS EM ALGORITHM #
  20. ###############################################################
  21.  
  22. # 'Pitches' is a list which has the 'x' and 'z' coordinates of the PITCHf/x data
  23. # 'tol' is the log-likelihood tolerance (e.g., 10^(-10))
  24. # 'iter_max' is the maximum number of iterations for the EM algorithm (e.g., 1000)
  25. # 'k_max' is the maximum number of clusters to look for (e.g., 10)
  26. # The output is the number of clusters, M, probabilities, w, means, mu, and covariances, sig
  27.  
  28. inc_k_means_fast <- function(Pitches,tol,iter_max,k_max){
  29.     # Assign the x- and z-data to vectors
  30.     x_coord <- Pitches$x;
  31.     z_coord <- Pitches$z;
  32.     # Find the number of pitches
  33.     T <- length(x_coord);
  34.     # Build the array
  35.     X <- array(0,dim=c(T,2));
  36.     X[,1] <- x_coord;
  37.     X[,2] <- z_coord;
  38.     # Initialize the count of mixtures
  39.     k <- 1;
  40.     # Set up empty blocks for mu and SIG
  41.     w_block <- array(1,dim=c(1,1));
  42.     mu_block <- array(0,c(1,2));
  43.     SIG_block_temp <- diag(2);
  44.     SIG_block <- array(0,c(2,2,1));
  45.     SIG_block[,,1] <- SIG_block_temp;
  46.     C_block <- array(1,dim=c(1,T));
  47.     phi_block <- array(0,dim=1);
  48.     # Allocate an array for the possible centroids and means
  49.     E <- array(0,dim=T);
  50.     w <- w_block;
  51.     mu <- mu_block;
  52.     SIG <- SIG_block;
  53.     C <- C_block;
  54.     phi <- phi_block;
  55.     # Calculate the first centroid
  56.     for (n in 1:T) {
  57.         E[n] <- sum(exp(-rowSums(sweep(X,2,X[n,],"-")^2)));
  58.     }
  59.     # Find the index corresponding to the minimum of E
  60.     n_star <- which(E == max(E));
  61.     # Assign the centroid to mu
  62.     mu[1,] = X[n_star,];
  63.     # Compute the covariance matrix
  64.     X_temp <- sweep(X,2,mu[1,],"-")
  65.     SIG[,,1] <- t(X_temp)%*%X_temp;
  66.     SIG[,,1] <- SIG[,,1]/(T-1);
  67.     # Save out the data to new arrays
  68.     while(max(phi[c(1:(k-1))]) <= 0 && k < k_max){
  69.         # Update the parameters
  70.         w_old <- w;
  71.         mu_old <- mu;
  72.         SIG_old <- SIG;
  73.         C_old <- C;
  74.         # Add a new cluster
  75.         k <- k+1;
  76.         # Add an extra column to the w, mu, SIG, C, phi, and total_temp
  77.         w <- cbind(w,w_block);
  78.         mu <- rbind(mu,mu_block);
  79.         SIG <- abind(SIG,SIG_block);
  80.         C <- rbind(C,C_block);
  81.         phi <- rbind(phi,phi_block);
  82.         # Zero out E
  83.         E <- 0;
  84.         Total_Array <- array(0,dim=c(T,k-1));
  85.         # Calculate the next centroid
  86.         for(m in 1:(k-1)){
  87.             Total_Array[,m] <- rowSums(sweep(X,2,mu[m,],"-")^2);
  88.         }
  89.         for(n in 1:T){
  90.             total <- 0;
  91.             for(t in 1:T){
  92.                 total <- total + min(c(Total_Array[t,],sum((X[t,] - X[n,])^2)));
  93.             }
  94.             E[n] <- total;
  95.         }
  96.         n_star <- which(E == min(E));
  97.         mu[k,] = X[n_star,];
  98.         # To start EM, compute the initial log likelihood
  99.         Q_old <- 0;
  100.         Q_new <- 0;
  101.         # Set the iterations counter
  102.         iter <- 0;
  103.         # Use a while loop to determine when to end EM
  104.         while( (abs(Q_new-Q_old) > tol && iter < iter_max) || iter == 0 ){
  105.             # Expectation Step
  106.             C <- E_Step(X,w,mu,SIG,k,T);
  107.             # Maximization Step
  108.             w <- M_Step_Mix(C,T);
  109.             SIG <- M_Step_Var(X,mu,C,k);
  110.             # Compute the log likelihood and update the Q-values
  111.             Q_old <- Q_new;
  112.             Q_new <- Q_theta(X,w,mu,SIG,C,k);
  113.             iter <- iter + 1;
  114.         }
  115.         # Calculate the probabilities for testing independence of clusters
  116.         phi <- phi_ik(C,k,T);
  117.     }
  118.    
  119.     # If the max iterations is reached, output the values. Otherwise, output the results when the algorithm ends
  120.     if(k == k_max){
  121.         # Turn the output into a list
  122.         Out <- list("M" = k, "w" = w, "mu" = mu, "SIG" = SIG, "C" = C);
  123.     }
  124.     else{
  125.         # Set M, the number of clusters, equal to k-1
  126.         M = k-1;
  127.         # Turn the output into a list
  128.         Out <- list("M" = M, "w" = w_old, "mu" = mu_old, "SIG" = SIG_old, "C" = C_old);
  129.     }
  130.     return(Out);
  131. }
  132.  
  133. ########################
  134. # ADDITIONAL FUNCTIONS #
  135. ########################
  136.  
  137. # Expectation Step
  138.  
  139. E_Step <- function(X,w,mu,SIG,K,T){
  140.     C <- array(0,dim=c(K,T));
  141.     for(i in 1:K){
  142.         Xm <- sweep(X,2,mu[i,],"-");
  143.         SiXm <- solve(SIG[,,i],t(Xm));
  144.         C[i,] <- (w[i]/((2*pi)*sqrt(det(SIG[,,i]))))*exp(-0.5*rowSums(Xm*t(SiXm)));
  145.     }
  146.     C_sum <- colSums(C);
  147.     C <- sweep(C,2,C_sum,"/");
  148.     return(C);
  149. }      
  150.  
  151. # Log Likelihood
  152.  
  153. Q_theta <- function(X,w,mu,SIG,C,K){
  154.     Q <- 0;
  155.     for(i in 1:K){
  156.         Xm <- sweep(X,2,mu[i,],"-");
  157.         SiXm <- solve(SIG[,,i],t(Xm));
  158.         Q_temp <- C[i,]%*%(log(w[i]) - 0.5*log(det(SIG[,,i])) -
  159.                   0.5*rowSums(Xm*t(SiXm)) - log(2*pi));
  160.         Q <- Q + Q_temp;
  161.     }
  162.     return(Q);
  163. }
  164.  
  165. # Mixture Weights
  166.  
  167. M_Step_Mix <- function(C,T){
  168.     w <- (1/T)*t(rowSums(C));
  169.     return(w);
  170. }
  171.  
  172. # Variances
  173.  
  174. M_Step_Var <- function(X,mu,C,K){
  175.     SIG <- array(0,dim=c(2,2,K));
  176.     BOT <- rowSums(C);
  177.     for (i in 1:K){
  178.         X_temp <- sweep(X,2,mu[i,],"-");
  179.         TOP <- t(sweep(X_temp,1,C[i,],"*"))%*%X_temp;
  180.         SIG[,,i] <- TOP/BOT[i];
  181.     }
  182.     return(SIG);
  183. }
  184.  
  185. # Mixture Probabilities
  186.  
  187. phi_ik <- function(C,K,T){
  188.     p_i <- (1/T)*rowSums(C);
  189.     p_ik <- (1/T)*rowSums(sweep(C,2,C[K,],"*"));
  190.     phi <- p_ik*log(p_ik/(p_i*p_i[K]))/log(2);
  191.     return(phi);
  192. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement