Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- import math
- import random
- import numpy as np
- from scipy.spatial import distance
class Coordinate:
    """A single point, stored as a sequence of coordinate values.

    Attributes:
        data: the sequence handed to the constructor, kept as-is.
        length: number of entries in ``data`` (the point's dimensionality).
    """

    def __init__(self, data):
        """Keep ``data`` and record how many components it holds."""
        self.data = data
        self.length = len(self.data)
class Cluster:
    """A set of points together with their centroid.

    Attributes:
        points: list of point objects currently assigned to the cluster;
            each must expose ``data`` (coordinate values) and ``length``.
        length: dimensionality, read from the first point at construction.
        center: a ``Coordinate`` holding the per-dimension mean of ``points``.
    """

    def __init__(self, coordinates):
        """Create a cluster from a non-empty list of points.

        Raises:
            IndexError: if ``coordinates`` is empty.
        """
        self.points = coordinates
        # All the points have the same dimensionality, we can just use one point:
        self.length = coordinates[0].length
        self.center = self.calcCenter()

    def calcCenter(self):
        """Return a new ``Coordinate`` at the mean of the current points.

        With no points the result is a ``Coordinate`` built from an empty
        list, because there are no columns to average.
        """
        all_points = len(self.points)
        all_coordinates = [x.data for x in self.points]
        # Transpose so that each tuple holds one dimension across all points.
        all_data_aligned = zip(*all_coordinates)
        # math.fsum returns a float, so the division is never integer division.
        true_center = [math.fsum(dList) / all_points for dList in all_data_aligned]
        return Coordinate(true_center)

    def improve(self, coordinates):
        """Replace the cluster's points and re-centre it.

        Args:
            coordinates: the points newly assigned to this cluster.

        Returns:
            The Euclidean distance the centre moved, or ``0.0`` when
            ``coordinates`` is empty.
        """
        bad_center = self.center
        self.points = coordinates
        if not coordinates:
            # No point was assigned this round.  Recomputing would produce a
            # zero-dimensional centre that cannot be compared with the old
            # one, so keep the previous centre and report no movement.
            return 0.0
        self.center = self.calcCenter()
        improvement = euclid(bad_center, self.center)
        return improvement
def euclid(point1, point2):
    """Return the Euclidean distance between two points.

    Both arguments must expose their coordinate values through a ``data``
    attribute; the computation itself is delegated to
    ``scipy.spatial.distance.euclidean``.
    """
    first_vector = point1.data
    second_vector = point2.data
    return distance.euclidean(first_vector, second_vector)
def sse(cluster):
    """Return the sum of squared errors over a sequence of clusters.

    For every cluster in ``cluster`` and every point in its ``points``,
    the squared Euclidean distance between the cluster's ``center`` and
    that point is added to the running total.  An empty sequence gives 0.0.
    """
    total = 0.0
    for group in cluster:
        for member in group.points:
            total += euclid(group.center, member) ** 2
    return total
def kmeans(data, k, threshold):
    """Cluster ``data`` into ``k`` groups by iterative re-centring.

    Args:
        data: list of point objects accepted by ``euclid`` and ``Cluster``.
        k: number of clusters; ``k`` distinct elements of ``data`` are
            drawn at random as the initial one-point clusters.
        threshold: iteration stops once the largest centre movement in a
            round is below this value.

    Returns:
        The list of ``Cluster`` objects after the final round.

    Progress lines of the form ``<cluster index>,<sse>`` are printed while
    clusters are updated, and "System converged!" is printed on exit.
    """
    seeds = random.sample(data, k)
    clusters = [Cluster([seed]) for seed in seeds]
    while True:
        num_clusters = len(clusters)
        buckets = [[] for _ in range(num_clusters)]

        # Assignment step: each point goes to the nearest centre; on a tie
        # the lowest cluster index wins because the comparison is strict.
        for point in data:
            nearest = 0
            shortest = euclid(point, clusters[0].center)
            for idx in range(1, num_clusters):
                candidate = euclid(point, clusters[idx].center)
                if candidate < shortest:
                    shortest = candidate
                    nearest = idx
            buckets[nearest].append(point)

        # Update step: re-centre every cluster and track the largest shift.
        largest_shift = 0.0
        for idx in range(num_clusters):
            shift = clusters[idx].improve(buckets[idx])
            largest_shift = max(largest_shift, shift)
            # NOTE(review): the pasted source lost its indentation; the SSE
            # trace is assumed to run once per cluster update since it
            # prints the loop variable -- confirm against the original.
            running_sse = sse(clusters)
            print(str(idx) + "," + str(running_sse))

        if largest_shift < threshold:
            print("System converged!")
            break
    return clusters
# Read the comma-separated data file and wrap every row in a Coordinate.
dataset = np.loadtxt("cluster-data-for-k-means.csv", delimiter=",")
dataset = dataset.tolist()
format_data = [Coordinate(row) for row in dataset]

# Run k-means for k = 1, 2, 3 and report the resulting SSE for each.
for k in range(1, 4):
    result = kmeans(format_data, k, 0.0001)
    calc_error = sse(result)
    print("With K value " + str(k) + ", SSE: " + str(calc_error))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement