Guest User

Untitled

a guest
Oct 24th, 2018
115
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.77 KB | None | 0 0
  1. import pandas
  2. import numpy
  3. import pickle
  4. import math
  5. from sklearn.cluster import MiniBatchKMeans, KMeans
  6. from matplotlib import pyplot as plt
  7. from sklearn.metrics import silhouette_score
  8. from pas.algorithm.som_with_tensorflow import SOM
  9.  
  10.  
  11. class ClusterWithSomNKmeans(object):
  12.  
  13. def __init__(self):
  14. return
  15.  
  16. def run_som( self, _n, _m, _dim_features, _nitr, _alpha, _nparray ):
  17. som_object = SOM(_n,
  18. _m,
  19. _dim_features,
  20. _nitr,
  21. _alpha)
  22. som_object.train(_nparray)
  23.  
  24. # Get output grid
  25. image_grid = som_object.get_centroids()
  26.  
  27. # value of winner node
  28. mapped = som_object.map_vects(_nparray)
  29.  
  30. #for index in range(0, len(mapped)):
  31. # print(str(self._feature.get_column_names(index)) + ": " + str(mapped[index]))
  32.  
  33. #self._som_result = som_object
  34. #print(som_object)
  35.  
  36. return som_object
  37.  
  38. def getBestPointInCurve(self, arr_curve):
  39. curve_point = numpy.asarray([i for i in range(0, len(arr_curve))])
  40. nPoints = len(arr_curve)
  41. allCoord = numpy.vstack((range(nPoints), arr_curve)).T
  42. numpy.array([range(nPoints), arr_curve])
  43. firstPoint = allCoord[0]
  44. lineVec = allCoord[-1] - allCoord[0]
  45. lineVecNorm = lineVec / numpy.sqrt(numpy.sum(lineVec ** 2))
  46. vecFromFirst = allCoord - firstPoint
  47. scalarProduct = numpy.sum(vecFromFirst * numpy.matlib.repmat(lineVecNorm, nPoints, 1), axis=1)
  48. vecFromFirstParallel = numpy.outer(scalarProduct, lineVecNorm)
  49. vecToLine = vecFromFirst - vecFromFirstParallel
  50. distToLine = numpy.sqrt(numpy.sum(vecToLine ** 2, axis=1))
  51. idxOfBestPoint = numpy.argmax(distToLine)
  52. return idxOfBestPoint
  53.  
  54. def findOptimalNumberOfGroup(self, X, min_number_of_group, max_number_of_group, is_plot=False):
  55. # Clustering by k-means
  56. # Get optimal number of group
  57. arr_sil_coeff = []
  58. arr_sil_coeff_k = []
  59. arr_elbow = []
  60.  
  61. for n_test_cluster in range(min_number_of_group, max_number_of_group):
  62. #print("K-Means for find optimal number of groups - " + str(n_test_cluster))
  63. algorithm = MiniBatchKMeans(init='k-means++', n_clusters=n_test_cluster, batch_size=100)
  64.  
  65. test_elbow = algorithm.fit(X).score(X)
  66. test_labels = algorithm.labels_
  67. test_sil_coeff = silhouette_score(X, test_labels, metric='euclidean')
  68.  
  69. arr_sil_coeff.append(test_sil_coeff)
  70. arr_elbow.append(test_elbow)
  71. arr_sil_coeff_k.append(n_test_cluster)
  72.  
  73. #print("n_cluster = " + str(n_test_cluster) + " - sil_coeff = " + str(test_sil_coeff) + " - elbow = " + str(
  74. # test_elbow))
  75.  
  76. arr_sil_coeff_k = numpy.asarray(arr_sil_coeff_k)
  77. idx_best_sil = self.getBestPointInCurve(numpy.asarray(arr_sil_coeff))
  78. idx_best_elbow = self.getBestPointInCurve(numpy.asarray(arr_elbow))
  79.  
  80. print("optimal k: sil_coeff -> " + str(arr_sil_coeff_k[idx_best_sil]) + ", elbow -> " + str(
  81. arr_sil_coeff_k[idx_best_elbow]))
  82.  
  83. if is_plot:
  84. plt.plot(arr_sil_coeff)
  85. plt.xticks(arr_sil_coeff_k)
  86. plt.title("Sil_Coeff")
  87. plt.show()
  88.  
  89. plt.plot(arr_elbow)
  90. plt.xticks(arr_sil_coeff_k)
  91. plt.title("Elbow")
  92. plt.show()
  93.  
  94. return arr_sil_coeff_k[idx_best_sil], arr_sil_coeff_k[idx_best_elbow]
  95.  
  96. def make_cluster_with_kmeans(self,_som_object,_n_som,_m_som,_dim_som_feature,_nparray):
  97. assert (len( _nparray ) > 0)
  98. assert (_dim_som_feature == len( _nparray[0] ))
  99.  
  100. # Find Optimal Number of Groups using MiniBatchKMeans
  101. centroids = _som_object.get_centroids()
  102. centroids = numpy.array( centroids )
  103. centroids.shape = ( _n_som * _m_som, _dim_som_feature)
  104. # linear_clusters = hdbscan.HDBSCAN( min_cluster_size=5 ).fit_predict( centroids )
  105.  
  106. #clustering_algorithm
  107. max_number_of_group = len(_nparray)
  108. som_sil, som_elbow = self.findOptimalNumberOfGroup( centroids, 2, max_number_of_group, is_plot=False )
  109. print( "SOM Sil: " + str( som_sil ) + " - SOM Elbow: " + str( som_elbow ) )
  110. n_cluster = min( som_sil, som_elbow )
  111.  
  112. # Input Data for Clustering
  113. #X = numpy.array( self._feature._listoflist )
  114. # X.shape = (len(self._feature._listoflist), _dim_of_som_feature)
  115. X = _nparray
  116. X.shape = (len(_nparray), _dim_som_feature)
  117.  
  118. # Make cluster with MiniBatchKMeans.
  119. clustering_algorithm = MiniBatchKMeans( init='k-means++', n_clusters=n_cluster, batch_size=100 )
  120. # clustering_algorithm = KMeans( n_clusters=self._number_of_cluster, init='k-means++', random_state=10 )
  121. clustering_algorithm.fit( X )
  122.  
  123. # clustering_algorithm = hdbscan.HDBSCAN( int(self._number_of_cluster) )
  124. # linear_result = clustering_algorithm.fit_predict( X )
  125. # print("The hdbscan result")
  126. # print(linear_result)
  127.  
  128. # print(self._cluster.cluster_centers_)
  129.  
  130. return clustering_algorithm
  131.  
  132. # input :
  133. # _x : input vectors
  134. # _clustering_algorithm : clustring algorith such as KMeans
  135. # returns : a list of three values, label, distance, and similarity
  136. """
  137. def get_relationships_vector_and_clusters(self, _x, _clustering_algorithm):
  138. labels = _clustering_algorithm.predict( _x )
  139. label = labels[0]
  140.  
  141. result = []
  142. center = _clustering_algorithm.cluster_centers_[ label ]
  143. distance = numpy.linalg.norm( _x - center )
  144. similarity = numpy.inner( _x, center ) \
  145. / (numpy.linalg.norm( _x ) * numpy.linalg.norm( center ))
  146.  
  147. return result
  148. """
  149.  
  150. # _nparray : 2D array
  151. def get_relationships_2DArray_and_clusters( self, _nparray , _clustering_algorithm):
  152. labels = _clustering_algorithm.predict(_nparray)
  153. assert( len(_nparray) == len(labels) )
  154.  
  155. result = [] #numpy.asarray( [i for i in range( 0, 3 )] )
  156. index = 0
  157. # for vector_x, vector_y in zip( nparray, self._cluster.cluster_centers_ ):
  158. center_list = [] #numpy.asarray( [i for i in range( 0, len(nparray) )])
  159. for i in range( 0, len(_nparray) ):
  160. center_list.append( _clustering_algorithm.cluster_centers_[ labels[i] ] )
  161. center_list = numpy.asarray(center_list)
  162.  
  163. for vector in _nparray:
  164. distance = numpy.linalg.norm( vector - center_list[index] )
  165. similarity = numpy.inner( vector, center_list[index] ) \
  166. / (numpy.linalg.norm( vector ) * numpy.linalg.norm( center_list[index] ))
  167. label = labels[index]
  168. # returned feature
  169. a_list = [label, distance, similarity]
  170. result.append( a_list )
  171. index = index + 1
  172.  
  173. return result
Add Comment
Please, Sign In to add comment