Advertisement
Guest User

Untitled

a guest
Feb 23rd, 2017
60
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.06 KB | None | 0 0
  1. import theano
  2. import theano.tensor as T
  3. import numpy as np
  4.  
  5.  
  6. def asfloat(value):
  7. """ Convert variable to float type configured by theano
  8. floatX variable.
  9.  
  10. Parameters
  11. ----------
  12. value : matrix, ndarray or scalar
  13. Value that could be converted to float type.
  14.  
  15. Returns
  16. -------
  17. matrix, ndarray or scalar
  18. Output would be input value converted to float type
  19. configured by theano floatX variable.
  20. """
  21.  
  22. if isinstance(value, (np.matrix, np.ndarray)):
  23. return value.astype(theano.config.floatX)
  24.  
  25. float_x_type = np.cast[theano.config.floatX]
  26. return float_x_type(value)
  27.  
  28.  
  29. def clone(instance):
  30. class_ = instance.__class__
  31. parameters = instance.get_params()
  32. return class_(**parameters)
  33.  
  34.  
  35. class FuzzyCMeans(object):
  36. """
  37. Fuzzy c-means.
  38.  
  39. Paramaters
  40. ----------
  41. n_clusters : int
  42. Number of clusters.
  43. m : float
  44. """
  45. def __init__(self, n_clusters, m=2):
  46. if n_clusters < 2:
  47. raise ValueError("Number of clusters should be greater than 2")
  48.  
  49. if m < 1:
  50. raise ValueError("Parameter `m` should be greater than 1")
  51.  
  52. self.n_clusters = n_clusters
  53. self.m = m
  54.  
  55. self.centers_ = None
  56. self.is_initialized = False
  57.  
  58. def init_methods(self):
  59. if self.is_initialized:
  60. raise AttributeError("Methods have already been initialized.")
  61.  
  62. x = T.matrix('x')
  63. centers = self.centers_
  64.  
  65. d = distance_to_centers = (
  66. x.reshape((x.shape[0], 1, x.shape[1])) -
  67. centers.reshape((1, centers.shape[0], centers.shape[1]))
  68. ).norm(L=2, axis=2)
  69.  
  70. weights = 1 / (
  71. (
  72. distance_to_centers.reshape((d.shape[0], d.shape[1], 1)) /
  73. distance_to_centers.reshape((d.shape[0], 1, d.shape[1]))
  74. ) ** asfloat(2. / (self.m - 1))
  75. ).sum(axis=2)
  76.  
  77. proba = weights / T.sum(weights, axis=1).reshape((-1, 1))
  78. proba_power_m = weights ** self.m
  79. new_centers = (
  80. proba_power_m.T.dot(x) /
  81. T.sum(proba_power_m, axis=0).reshape((-1, 1))
  82. )
  83.  
  84. self.predict_proba = theano.function([x], proba)
  85. self.train_iteration = theano.function([x], proba, updates=[
  86. (centers, new_centers),
  87. ])
  88.  
  89. self.is_initialized = True
  90.  
  91. def get_params(self, deep=False):
  92. return dict(n_clusters=self.n_clusters, m=self.m)
  93.  
  94. @property
  95. def centers(self):
  96. return self.centers_.get_value()
  97.  
  98. def fit(self, data, maxiter=100, epsilon=1e-5, verbose=False):
  99. n_features = data.shape[1]
  100.  
  101. if self.centers_ is None:
  102. data_min = data.min(axis=0)
  103. data_max = data.max(axis=0)
  104.  
  105. random_centers = np.random.random((self.n_clusters, n_features))
  106. scaled_centers = (data_max - data_min) * random_centers + data_min
  107. self.centers_ = theano.shared(
  108. name='centers',
  109. value=asfloat(scaled_centers)
  110. )
  111.  
  112. n_expected_features = self.centers.shape[1]
  113. if n_expected_features != n_features:
  114. raise ValueError("Input data must contain {} features, "
  115. "found {}".format(n_expected_features,
  116. n_features))
  117.  
  118. if not self.is_initialized:
  119. self.init_methods()
  120.  
  121. i = 1
  122. proba_update = np.inf
  123. prev_proba = None
  124. while (proba_update > epsilon) and (i <= maxiter):
  125. proba = self.train_iteration(data)
  126.  
  127. if prev_proba is not None:
  128. proba_update = np.linalg.norm(prev_proba - proba)
  129.  
  130. prev_proba = proba
  131. i += 1
  132.  
  133. def predict(self, data):
  134. proba = self.predict_proba(data)
  135. return proba.argmax(axis=1)
  136.  
  137.  
  138. def select_best_clustering(algorithm, n_trials, data, **fit_kwargs):
  139. """ Select best clusters using SSE.
  140.  
  141. Parameters
  142. ----------
  143. algorithm : object
  144. n_trials : int
  145. data : matrix
  146.  
  147. Raises
  148. ------
  149. ValueError
  150. Exception will raise in case input parameter values
  151. are invalid.
  152.  
  153. Returns
  154. -------
  155. object
  156. Pretrained clustering algorithm that give smallest
  157. SSE (sum of squared error) score.
  158. """
  159.  
  160. if n_trials < 1:
  161. raise ValueError("Number of trials should be greater than 1")
  162.  
  163. if not isinstance(n_trials, int):
  164. raise ValueError("Number of tirals should be an integer number")
  165.  
  166. algorithms = []
  167. for trial in range(n_trials):
  168. algorithm = clone(algorithm)
  169. algorithm.fit(data, **fit_kwargs)
  170.  
  171. clusters = algorithm.predict(data)
  172. centers = algorithm.centers[clusters, :]
  173. sse_score = np.sum((data - centers) ** 2)
  174.  
  175. # We should use second variable as unique value to prevent
  176. # object instance comparison when we have exactly the same
  177. # score values.
  178. algorithms.append((sse_score, trial, algorithm))
  179.  
  180. _, _, best_algorithm = min(algorithms)
  181. return best_algorithm
  182.  
  183.  
  184. if __name__ == '__main__':
  185. fcm = FuzzyCMeans(n_clusters=2, m=2)
  186. fcm.fit(data, maxiter=100)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement