kmeans

a guest
May 29th, 2016

import sys
import math
import random
import numpy as np
from scipy.spatial import distance

class Coordinate:
    """A single data point: a list of coordinate values and its dimensionality."""
    def __init__(self, data):
        self.data = data
        self.length = len(data)

class Cluster:
    """A cluster of Coordinates together with its current centroid."""
    def __init__(self, coordinates):
        self.points = coordinates
        # All the points have the same dimensionality, so we can just use one point:
        self.length = coordinates[0].length
        self.center = self.calcCenter()

    def calcCenter(self):
        # The centroid is the per-dimension mean of all points in the cluster.
        all_points = len(self.points)
        all_coordinates = [p.data for p in self.points]
        all_data_aligned = zip(*all_coordinates)
        true_center = [math.fsum(dim) / all_points for dim in all_data_aligned]
        return Coordinate(true_center)

    def improve(self, coordinates):
        # Reassign the cluster's points, recompute the centroid, and return
        # how far the centroid moved (used as the convergence measure).
        if not coordinates:
            # Edge case: a cluster that received no points keeps its old
            # centroid instead of crashing on an empty mean.
            return 0.0
        bad_center = self.center
        self.points = coordinates
        self.center = self.calcCenter()
        improvement = euclid(bad_center, self.center)
        return improvement

def euclid(point1, point2):
    # Euclidean distance between two Coordinates.
    return distance.euclidean(point1.data, point2.data)


def sse(clusters):
    # Sum of squared distances from every point to its cluster's centroid.
    total = 0.0
    for c in clusters:
        for p in c.points:
            total += euclid(c.center, p) ** 2
    return total

def kmeans(data, k, threshold):
    # Pick k distinct points at random as the initial cluster centers.
    initpoints = random.sample(data, k)
    clusters = [Cluster([p]) for p in initpoints]
    while True:
        # Assignment step: put every point in the list of its nearest center.
        all_lists = [[] for c in clusters]
        num_clusters = len(clusters)
        for point in data:
            best_distance = euclid(point, clusters[0].center)
            curr_index = 0
            for i in range(num_clusters - 1):
                point_distance = euclid(point, clusters[i + 1].center)
                if point_distance < best_distance:
                    best_distance = point_distance
                    curr_index = i + 1
            all_lists[curr_index].append(point)
        # Update step: move each centroid to the mean of its assigned points
        # and track the largest centroid shift in this iteration.
        best_improvement = 0.0
        for y in range(num_clusters):
            improvement = clusters[y].improve(all_lists[y])
            best_improvement = max(best_improvement, improvement)
            sse_test = sse(clusters)
            print(str(y) + "," + str(sse_test))
        # Stop once no centroid moved more than the threshold.
        if best_improvement < threshold:
            print("System converged!")
            break
    return clusters

# Load the data set, wrap each row in a Coordinate, and run k-means for k = 1..3.
dataset = np.loadtxt("cluster-data-for-k-means.csv", delimiter=",")
dataset = dataset.tolist()
format_data = [Coordinate(row) for row in dataset]
for k in range(1, 4):
    result = kmeans(format_data, k, 0.0001)
    calc_error = sse(result)
    print("With K value " + str(k) + ", SSE: " + str(calc_error))
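
The script expects a file named cluster-data-for-k-means.csv next to it, which is not included in this paste. A minimal sketch for generating a compatible file is below; the three 2-D Gaussian blobs, their locations, and the point counts are assumptions made purely so the script can be run end to end, not part of the original data.

# make_test_data.py -- hypothetical helper, not part of the original paste.
# Writes a small synthetic 2-D data set that the k-means script can load.
import numpy as np

rng = np.random.RandomState(0)
blobs = [
    rng.normal(loc=(0.0, 0.0), scale=0.5, size=(50, 2)),
    rng.normal(loc=(5.0, 5.0), scale=0.5, size=(50, 2)),
    rng.normal(loc=(0.0, 5.0), scale=0.5, size=(50, 2)),
]
data = np.vstack(blobs)
np.savetxt("cluster-data-for-k-means.csv", data, delimiter=",")

With a file like this in place, the paste prints the running SSE as each cluster is updated and a final SSE per k; the final SSE generally decreases as k increases.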