Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- function [mu, C, iters, err] = fkmeansMA(X, k)
- % find the centers of k clusters in the set of data X
- % X is an m x n matrix of n, m-dimensional points
- % k is the number of desired centers for the data
- % mu is the coordinates of the centers of each cluster
- % C is a vector, where each point in X, i is in cluster C(i)
- % iters is the number of iterations required to converge on mu
- fuzz = 3.75;
- [m n] = size(X);
- mn = min(min(X));
- mx = max(max(X));
- % initial guesses are random numbers between the minimum
- % and maximum values in X. This is probably the part that would most
- % benefit from a slightly more intelligent method.
- mu = randn(m,k)*(mx - mn) + mn;
- % initialize probability matrices
- % probs is a k by n matrix, where each row i represents cluster i
- % and each column j represents X(i)
- prev = zeros(k,n);
- probs = ones(k,n);
- % count how many iterations it takes to converge
- % we restrict this to 100 iterations so it doesn't get stuck
- iters = 0;
- while ~isGoodEnough(probs, prev) && iters < 100
- prev = probs; % store the probabilities calculated in the last run
- % find the probabilities that each point i is in each cluster
- for i = 1:n
- dists = calcDistances(X(:,i), mu);
- probs(:,i) = clusterProb(dists);
- end
- % update our guess for each center by taking the weighted average
- % of all of the points, using the previously calculated
- % probabilities
- for i = 1:k
- P = probs(i,:);
- mu(:,i) = weightedAverage(X, P, fuzz);
- end
- iters = iters + 1;
- end
- [i, C] = max(probs);
- err = 0;
- for i = 1:k
- for j = X(C==k)
- err = err + dist(j,mu(:,i)).^2;
- end
- end
- end
- function avg = weightedAverage(points, probs, fuzz)
- % calculate the weighted averages
- % points is a matrix of all the points
- % probabilities is the probability that each point is in cluster i
- [m n] = size(points);
- avg = zeros(m,1);
- div = 0;
- % weighted average formula taken from clustering article on wikipedia
- for j = 1:n
- avg = avg + (probs(j).^fuzz * points(:,j));
- div = div + probs(j).^fuzz;
- end
- avg = avg/div;
- end
- function dists = calcDistances(point, centers)
- % calculate the distances of point from each of the centers
- [m, k] = size(centers);
- dists = zeros(1,k);
- for j = 1:k
- dists(j) = dist(point, centers(:,j));
- end
- end
- function good = isGoodEnough(probs, prev)
- % tests for convergence
- % we claim that we've converged when the difference between two iterations
- % of kmeans is less than iota
- iota = .001;
- difference = abs(probs - prev);
- good = all(all(difference < iota));
- end
- function probs = clusterProb(dists)
- % calculate the probability that a point is in a given cluster based on its
- % distance from that cluster. We use the reciprocal of the distances since
- % smaller numbers are better
- invs = 1 ./ dists;
- tot = sum(invs);
- probs = invs ./ tot;
- end
- function r = dist(x,y)
- % calculate the distance between x and y.
- % NOTE: doesn't actually calculate the distance. we only care about
- % relative values, and taking the square root here will be slower, but
- % won't change minimum distances
- r = sqrt(sum((x-y).^2));
- end
Add Comment
Please, Sign In to add comment