Don't like ads? PRO users don't see any ads ;-)
Guest

Untitled

By: a guest on Jun 9th, 2012  |  syntax: None  |  size: 4.69 KB  |  hits: 17  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. # Copyright Mathieu Blondel December 2011
  2.  
  3. import numpy as np
  4. import pylab as pl
  5.  
  6. from sklearn.base import BaseEstimator
  7. from sklearn.utils import check_random_state
  8. from sklearn.cluster import MiniBatchKMeans
  9. from sklearn.cluster import KMeans as KMeansGood
  10. from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
  11. from sklearn.datasets.samples_generator import make_blobs
  12.  
  13. ##############################################################################
  14. # Generate sample data
  15. np.random.seed(0)
  16.  
  17. batch_size = 45
  18. centers = [[1, 1], [-1, -1], [1, -1]]
  19. n_clusters = len(centers)
  20. X, labels_true = make_blobs(n_samples=1200, centers=centers, cluster_std=0.3)
  21.  
  22. class KMeans(BaseEstimator):
  23.  
  24.     def __init__(self, k, max_iter=100, random_state=0, tol=1e-4):
  25.         self.k = k
  26.         self.max_iter = max_iter
  27.         self.random_state = random_state
  28.         self.tol = tol
  29.  
  30.     def _e_step(self, X):
  31.         self.labels_ = euclidean_distances(X, self.cluster_centers_,
  32.                                      squared=True).argmin(axis=1)
  33.  
  34.     def _average(self, X):
  35.         return X.mean(axis=0)
  36.  
  37.     def _m_step(self, X):
  38.         X_center = None
  39.         for center_id in range(self.k):
  40.             center_mask = self.labels_ == center_id
  41.             if not np.any(center_mask):
  42.                 # The centroid of empty clusters is set to the center of
  43.                 # everything
  44.                 if X_center is None:
  45.                     X_center = self._average(X)
  46.                 self.cluster_centers_[center_id] = X_center
  47.             else:
  48.                 self.cluster_centers_[center_id] = \
  49.                     self._average(X[center_mask])
  50.  
  51.     def fit(self, X, y=None):
  52.         n_samples = X.shape[0]
  53.         vdata = np.mean(np.var(X, 0))
  54.  
  55.         random_state = check_random_state(self.random_state)
  56.         self.labels_ = random_state.permutation(n_samples)[:self.k]
  57.         self.cluster_centers_ = X[self.labels_]
  58.  
  59.         for i in xrange(self.max_iter):
  60.             centers_old = self.cluster_centers_.copy()
  61.  
  62.             self._e_step(X)
  63.             self._m_step(X)
  64.  
  65.             if np.sum((centers_old - self.cluster_centers_) ** 2) < self.tol * vdata:
  66.                 break
  67.  
  68.         return self
  69.  
  70. class KMedians(KMeans):
  71.  
  72.     def _e_step(self, X):
  73.         self.labels_ = manhattan_distances(X, self.cluster_centers_).argmin(axis=1)
  74.  
  75.     def _average(self, X):
  76.         return np.median(X, axis=0)
  77.  
  78. class FuzzyKMeans(KMeans):
  79.  
  80.     def __init__(self, k, m=2, max_iter=100, random_state=0, tol=1e-4):
  81.         """
  82.         m > 1: fuzzy-ness parameter
  83.         The closer to m is to 1, the closter to hard kmeans.
  84.         The bigger m, the fuzzier (converge to the global cluster).
  85.         """
  86.         self.k = k
  87.         assert m > 1
  88.         self.m = m
  89.         self.max_iter = max_iter
  90.         self.random_state = random_state
  91.         self.tol = tol
  92.  
  93.     def _e_step(self, X):
  94.         D = 1.0 / euclidean_distances(X, self.cluster_centers_, squared=True)
  95.         D **= 1.0 / (self.m - 1)
  96.         D /= np.sum(D, axis=1)[:, np.newaxis]
  97.         # shape: n_samples x k
  98.         self.fuzzy_labels_ = D
  99.         self.labels_ = self.fuzzy_labels_.argmax(axis=1)
  100.  
  101.     def _m_step(self, X):
  102.         weights = self.fuzzy_labels_ ** self.m
  103.         # shape: n_clusters x n_features
  104.         self.cluster_centers_ = np.dot(X.T, weights).T
  105.         self.cluster_centers_ /= weights.sum(axis=0)[:, np.newaxis]
  106.  
  107.     def fit(self, X, y=None):
  108.         n_samples, n_features = X.shape
  109.         vdata = np.mean(np.var(X, 0))
  110.  
  111.         random_state = check_random_state(self.random_state)
  112.         self.fuzzy_labels_ = random_state.rand(n_samples, self.k)
  113.         self.fuzzy_labels_ /= self.fuzzy_labels_.sum(axis=1)[:, np.newaxis]
  114.         self._m_step(X)
  115.  
  116.         for i in xrange(self.max_iter):
  117.             centers_old = self.cluster_centers_.copy()
  118.  
  119.             self._e_step(X)
  120.             self._m_step(X)
  121.  
  122.             if np.sum((centers_old - self.cluster_centers_) ** 2) < self.tol * vdata:
  123.                 break
  124.  
  125.         return self
  126.  
  127.  
  128. kmeans = KMeans(k=3)
  129. kmeans.fit(X)
  130.  
  131. kmedians = KMedians(k=3)
  132. kmedians.fit(X)
  133.  
  134. fuzzy_kmeans = FuzzyKMeans(k=3, m=2)
  135. fuzzy_kmeans.fit(X)
  136.  
  137. fig = pl.figure()
  138. colors = ['#4EACC5', '#FF9C34', '#4E9A06']
  139.  
  140. objects = (kmeans, kmedians, fuzzy_kmeans)
  141.  
  142. for i, obj in enumerate(objects):
  143.     ax = fig.add_subplot(1, len(objects), i + 1)
  144.     for k, col in zip(range(obj.k), colors):
  145.         my_members = obj.labels_ == k
  146.         cluster_center = obj.cluster_centers_[k]
  147.         ax.plot(X[my_members, 0], X[my_members, 1], 'w',
  148.                 markerfacecolor=col, marker='.')
  149.         ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
  150.                                         markeredgecolor='k', markersize=6)
  151.     ax.set_title(obj.__class__.__name__)
  152.  
  153. pl.show()