Advertisement
Guest User

Untitled

a guest
Oct 25th, 2016
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.73 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function, division, absolute_import, unicode_literals
  3. import time
  4. import itertools as it
  5. from sklearn.utils.extmath import row_norms
  6. from sklearn.metrics.pairwise import euclidean_distances
  7. from sklearn.metrics.pairwise import pairwise_distances_argmin_min
  8. import numpy as np
  9.  
  10.  
  11. class Timer(object):
  12. def __init__(self):
  13. self.tstart = -1
  14. self.ellapsed = -1
  15. self.default_timer = time.time
  16.  
  17. def tic(self):
  18. self.tstart = self.default_timer()
  19.  
  20. def toc(self):
  21. ellapsed = (self.default_timer() - self.tstart)
  22. return ellapsed
  23.  
  24. def __enter__(self):
  25. self.tic()
  26. return self
  27.  
  28. def __exit__(self, type_, value, trace):
  29. self.ellapsed = self.toc()
  30. if trace is not None:
  31. return False
  32.  
  33.  
  34. def time_func(func_tup, iters=10):
  35. times = []
  36. func = func_tup[0]
  37. args = func_tup[1:]
  38. for i in range(iters):
  39. with Timer() as t:
  40. func(*args)
  41. times.append(t.ellapsed)
  42. ave_time = sum(times) / len(times)
  43. return ave_time
  44.  
  45.  
  46. def all_dict_combinations(varied_dict):
  47. tups_list = [[(key, val) for val in val_list]
  48. if isinstance(val_list, (list))
  49. else [(key, val_list)]
  50. for (key, val_list) in sorted(varied_dict.items())]
  51. dict_list = [dict(tups) for tups in it.product(*tups_list)]
  52. return dict_list
  53.  
  54.  
  55. def new_labels_inertia_precompute_dense(X, x_squared_norms, centers, batch_size):
  56. n_samples = X.shape[0] # NOQA
  57. metric_kwargs = dict(squared=True)
  58. labels, mindist = pairwise_distances_argmin_min(
  59. batch_size=batch_size,
  60. X=X, Y=centers, metric='euclidean', metric_kwargs=metric_kwargs)
  61. labels = labels.astype(np.int32)
  62. # Dont bother timing the lines that did not change
  63. # if n_samples == distances.shape[0]:
  64. # # distances will be changed in-place
  65. # distances[:] = mindist
  66. # inertia = mindist.sum()
  67. # return labels, inertia
  68.  
  69.  
  70. def old_labels_inertia_precompute_dense(X, x_squared_norms, centers):
  71. n_samples = X.shape[0]
  72. k = centers.shape[0]
  73. all_distances = euclidean_distances(centers, X, x_squared_norms,
  74. squared=True)
  75. labels = np.empty(n_samples, dtype=np.int32)
  76. labels.fill(-1)
  77. mindist = np.empty(n_samples)
  78. mindist.fill(np.infty)
  79. for center_id in range(k):
  80. dist = all_distances[center_id]
  81. labels[dist < mindist] = center_id
  82. mindist = np.minimum(dist, mindist)
  83. # Dont bother timing the lines that did not change
  84. # if n_samples == distances.shape[0]:
  85. # # distances will be changed in-place
  86. # distances[:] = mindist
  87. # inertia = mindist.sum()
  88. # return labels, inertia
  89.  
  90.  
  91. def make_X(n_clusters=2000, n_features=128, n_samples=10, dtype=np.float32):
  92. rng = np.random.RandomState(42)
  93. X = rng.rand(n_samples, n_features).astype(dtype)
  94. x_squared_norms = row_norms(X)
  95. centers = rng.rand(n_clusters, n_features).astype(dtype)
  96. return X, x_squared_norms, centers
  97.  
  98.  
  99. def single_benchmark(n_clusters, n_features, n_samples, batch_size=500,
  100. dtype=np.float32, niters=10):
  101. X, x_squared_norms, centers = make_X(n_clusters, n_features, n_samples, dtype)
  102. dtype_bytes = dtype(0).nbytes
  103. measures = {}
  104. size_old = X.shape[0] * centers.shape[0]
  105. size_new = min(batch_size, X.shape[0]) * centers.shape[0]
  106. # print(X.shape)
  107. # print(centers.shape)
  108. measures['MB_old'] = (size_old * dtype_bytes) / 2 ** 20
  109. measures['MB_new'] = (size_new * dtype_bytes) / 2 ** 20
  110. # print('measures = %r' % (measures,))
  111. measures['old_speed'] = time_func(
  112. (old_labels_inertia_precompute_dense,
  113. X, x_squared_norms, centers), niters)
  114. measures['new_speed'] = time_func(
  115. (new_labels_inertia_precompute_dense,
  116. X, x_squared_norms, centers, batch_size), niters)
  117. return measures
  118.  
  119.  
  120. def run_benchmark_grid(basis, name):
  121. print('Running %s benchmark' % (name,))
  122. import pandas as pd
  123. pd.options.display.max_rows = 1000
  124. pd.options.display.width = 1000
  125. vals = []
  126. try:
  127. import utool as ut
  128. ProgIter = ut.ProgIter
  129. except ImportError:
  130. def ProgIter(x):
  131. return x
  132. for kw in ProgIter(all_dict_combinations(basis)):
  133. # print('---------')
  134. # print('kw = %r' % (kw,))
  135. measures = single_benchmark(**kw)
  136. kw.update(measures)
  137. vals.append(kw)
  138.  
  139. print('====')
  140. print('Results for %s benchmark' % (name,))
  141. df = pd.DataFrame.from_dict(vals)
  142. df['percent_change'] = 100 * (df['old_speed'] - df['new_speed']) / df['old_speed']
  143. new_keys = ['MB_new', 'MB_old', 'new_speed', 'old_speed', 'percent_change']
  144. old_keys = sorted(set(df.columns) - set(new_keys))
  145. df = df.reindex_axis(old_keys + new_keys, axis=1)
  146. df['absolute_change'] = (df['old_speed'] - df['new_speed'])
  147. print(df.sort_values('absolute_change', ascending=False))
  148.  
  149.  
  150. def main():
  151. small_cluster_basis = {
  152. 'n_clusters': [2, 5, 10][::-1],
  153. 'n_features': [16, 32, 128][::-1],
  154. 'n_samples': [10, 20, 100, 1000, 50000][::-1],
  155. 'niters': [100],
  156. }
  157. run_benchmark_grid(small_cluster_basis, 'small clusters')
  158.  
  159. large_basis = {
  160. 'n_clusters': [10, 100, 1000][::-1],
  161. 'n_features': [16, 32, 128][::-1],
  162. 'n_samples': [10, 100, 1000, 10000, 50000][::-1],
  163. 'niters': [5],
  164. }
  165. run_benchmark_grid(large_basis, 'large clusters test')
  166.  
  167. batch_size_basis = {
  168. 'n_clusters': [1000][::-1],
  169. 'n_features': [32][::-1],
  170. 'n_samples': [10, 100, 1000, 10000][::-1],
  171. 'batch_size': [100, 500, 1000],
  172. 'niters': [5],
  173. }
  174. run_benchmark_grid(batch_size_basis, 'batch_size test')
  175.  
  176. if __name__ == '__main__':
  177. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement