Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # A class to partition input points using the k-means algorithm
- #
- # Use it like this:
- #
- # points = [[0,0],[0,1],[0,100],[0,104]]
- # result = KMeans.calculate(points, 2))
- # # result is [[0,0.5],[0,102]]
- class KMeans
- def self.calculate(points, num_means)
- means = generate_initial_means(points, num_means)
- old_means = []
- until means == old_means do
- old_means = means
- means = calculate_new_means(means, points)
- end
- means
- end
- def self.generate_initial_means(points, num_means)
- points.sample(num_means)
- end
- def self.calculate_new_means(current_means, points)
- partitions = group_points_by_closest_mean(current_means, points)
- new_means = partitions.map(&method(:calculate_center))
- new_means.sort
- end
- def self.group_points_by_closest_mean(current_means, points)
- map = Hash.new{|h, k| h[k] = []}
- points.each do |point|
- closest = find_closest_mean(current_means, point)
- map[closest] << point
- end
- map.values
- end
- def self.calculate_center(points)
- dimensions = points.transpose
- averages = dimensions.collect do |dimension|
- sum = dimension.reduce(:+)
- sum.to_f / dimension.length
- end
- averages
- end
- def self.distance_squared(a, b)
- a.zip(b)
- .map{|ai, bi| (ai - bi)**2}
- .reduce(:+)
- end
- def self.find_closest_mean(means, point)
- distances = means.map { |mean| distance_squared(mean, point) }
- # if there are duplicate means, each_with_index.min will always
- # return the one with the lowest index. That's as good a solution
- # as any in case of duplication.
- _min, index = distances.each_with_index.min
- means[index]
- end
- end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement