Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from __future__ import print_function, division, absolute_import, unicode_literals
- import time
- import itertools as it
- from sklearn.utils.extmath import row_norms
- from sklearn.metrics.pairwise import euclidean_distances
- from sklearn.metrics.pairwise import pairwise_distances_argmin_min
- import numpy as np
class Timer(object):
    """Context-manager stopwatch for wall-clock timing.

    Usage::

        with Timer() as t:
            work()
        print(t.ellapsed)  # seconds

    NOTE: the attribute name keeps the original spelling ``ellapsed``
    because callers (e.g. ``time_func``) read it by that name.
    """

    def __init__(self):
        self.tstart = -1            # timestamp recorded by tic(); -1 until started
        self.ellapsed = -1          # seconds measured by the last with-block
        self.default_timer = time.time

    def tic(self):
        """Record the start timestamp."""
        self.tstart = self.default_timer()

    def toc(self):
        """Return seconds elapsed since the last tic() (does not store them)."""
        return self.default_timer() - self.tstart

    def __enter__(self):
        self.tic()
        return self

    def __exit__(self, type_, value, trace):
        self.ellapsed = self.toc()
        # Returning a falsy value lets any exception from the with-block propagate.
        if trace is not None:
            return False
def time_func(func_tup, iters=10):
    """Average wall-clock runtime of a callable over several repetitions.

    Args:
        func_tup: tuple ``(callable, arg1, arg2, ...)`` — the callable
            followed by the positional arguments to invoke it with.
        iters (int): number of timed repetitions to average over.

    Returns:
        float: mean elapsed seconds per call.
    """
    func, args = func_tup[0], func_tup[1:]
    samples = []
    for _ in range(iters):
        with Timer() as timer:
            func(*args)
        samples.append(timer.ellapsed)
    return sum(samples) / len(samples)
def all_dict_combinations(varied_dict):
    """Expand a grid specification into every concrete combination.

    Each value in *varied_dict* is either a list of candidate values or a
    single scalar (treated as a one-element list).  Keys are processed in
    sorted order so the output ordering is deterministic.

    Args:
        varied_dict (dict): maps key -> list-of-values or scalar.

    Returns:
        list[dict]: one dict per point in the cartesian product.
    """
    per_key_choices = []
    for key, val_list in sorted(varied_dict.items()):
        if isinstance(val_list, list):
            per_key_choices.append([(key, val) for val in val_list])
        else:
            per_key_choices.append([(key, val_list)])
    return [dict(combo) for combo in it.product(*per_key_choices)]
def new_labels_inertia_precompute_dense(X, x_squared_norms, centers, batch_size):
    """Benchmark body for the batched (new) label-assignment path.

    Assigns each row of X to its nearest center via sklearn's
    ``pairwise_distances_argmin_min`` with an explicit ``batch_size``
    (old sklearn API).  Intentionally returns nothing: only the changed
    lines are timed, and the unchanged tail is kept below as comments.
    ``x_squared_norms`` is accepted to mirror the old variant's signature
    but is unused here.
    """
    n_samples = X.shape[0]  # NOQA
    labels, mindist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean',
        metric_kwargs=dict(squared=True),
        batch_size=batch_size)
    labels = labels.astype(np.int32)
    # Dont bother timing the lines that did not change
    # if n_samples == distances.shape[0]:
    #     # distances will be changed in-place
    #     distances[:] = mindist
    # inertia = mindist.sum()
    # return labels, inertia
def old_labels_inertia_precompute_dense(X, x_squared_norms, centers):
    """Benchmark body for the pre-batching (old) label-assignment path.

    Materializes the full (k, n_samples) distance matrix at once via
    sklearn's ``euclidean_distances`` and then scans it center-by-center
    to find each sample's nearest center.  Intentionally returns nothing:
    only the changed lines are timed; the unchanged tail is kept below
    as comments.

    Args:
        X: (n_samples, n_features) sample matrix.
        x_squared_norms: precomputed row norms of X, forwarded to
            ``euclidean_distances`` to skip recomputation.
        centers: (k, n_features) center matrix.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    # FIX: use np.inf — the np.infty alias was removed in NumPy 2.0.
    mindist.fill(np.inf)
    for center_id in range(k):
        dist = all_distances[center_id]
        # Claim any sample strictly closer to this center than to all previous.
        labels[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    # Dont bother timing the lines that did not change
    # if n_samples == distances.shape[0]:
    #     # distances will be changed in-place
    #     distances[:] = mindist
    # inertia = mindist.sum()
    # return labels, inertia
def make_X(n_clusters=2000, n_features=128, n_samples=10, dtype=np.float32):
    """Build a reproducible random benchmark problem.

    Uses a fixed seed (42) so repeated runs time identical data.  The RNG
    draw order (samples first, then centers) matches the original so the
    generated arrays are bit-identical.

    Returns:
        tuple: ``(X, x_squared_norms, centers)`` — the sample matrix, its
        row norms from sklearn's ``row_norms``, and the center matrix.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features).astype(dtype)
    centers = rng.rand(n_clusters, n_features).astype(dtype)
    return X, row_norms(X), centers
def single_benchmark(n_clusters, n_features, n_samples, batch_size=500,
                     dtype=np.float32, niters=10):
    """Time the old vs new label-assignment on one problem configuration.

    Args:
        n_clusters (int): number of centers.
        n_features (int): dimensionality of samples/centers.
        n_samples (int): number of samples.
        batch_size (int): batch size used by the new (batched) variant.
        dtype: numpy dtype for the generated data.
        niters (int): repetitions averaged by ``time_func``.

    Returns:
        dict: peak distance-matrix footprint in MB for each variant
        (``MB_old``/``MB_new``) and mean runtimes in seconds
        (``old_speed``/``new_speed``).
    """
    X, x_squared_norms, centers = make_X(n_clusters, n_features, n_samples, dtype)
    bytes_per_item = dtype(0).nbytes
    n_centers = centers.shape[0]
    # Distance-matrix entries materialized at once by each variant: the old
    # path builds the full matrix, the new path only one batch at a time.
    cells_old = X.shape[0] * n_centers
    cells_new = min(batch_size, X.shape[0]) * n_centers
    measures = {
        'MB_old': (cells_old * bytes_per_item) / 2 ** 20,
        'MB_new': (cells_new * bytes_per_item) / 2 ** 20,
    }
    measures['old_speed'] = time_func(
        (old_labels_inertia_precompute_dense,
         X, x_squared_norms, centers), niters)
    measures['new_speed'] = time_func(
        (new_labels_inertia_precompute_dense,
         X, x_squared_norms, centers, batch_size), niters)
    return measures
def run_benchmark_grid(basis, name):
    """Run ``single_benchmark`` over every combination in *basis* and print a table.

    Args:
        basis (dict): maps ``single_benchmark`` kwarg names to either a
            single value or a list of values; the grid is the cartesian
            product of the lists.
        name (str): label used in the printed headers.
    """
    print('Running %s benchmark' % (name,))
    import pandas as pd
    pd.options.display.max_rows = 1000
    pd.options.display.width = 1000
    vals = []
    try:
        # Optional progress bar; fall back to a pass-through wrapper.
        import utool as ut
        ProgIter = ut.ProgIter
    except ImportError:
        def ProgIter(x):
            return x
    for kw in ProgIter(all_dict_combinations(basis)):
        measures = single_benchmark(**kw)
        kw.update(measures)
        vals.append(kw)
    print('====')
    print('Results for %s benchmark' % (name,))
    df = pd.DataFrame.from_dict(vals)
    df['percent_change'] = 100 * (df['old_speed'] - df['new_speed']) / df['old_speed']
    new_keys = ['MB_new', 'MB_old', 'new_speed', 'old_speed', 'percent_change']
    old_keys = sorted(set(df.columns) - set(new_keys))
    # FIX: DataFrame.reindex_axis was deprecated in pandas 0.21 and removed
    # in 1.0; reindex(columns=...) is the supported equivalent of axis=1.
    df = df.reindex(columns=old_keys + new_keys)
    df['absolute_change'] = (df['old_speed'] - df['new_speed'])
    print(df.sort_values('absolute_change', ascending=False))
def main():
    """Entry point: run the three benchmark grids.

    1. many small-k problems, 2. larger-k problems, 3. a batch_size sweep.
    Value lists are written largest-first (the original reversed them with
    ``[::-1]``) so the slowest configurations run first.
    """
    run_benchmark_grid({
        'n_clusters': [10, 5, 2],
        'n_features': [128, 32, 16],
        'n_samples': [50000, 1000, 100, 20, 10],
        'niters': [100],
    }, 'small clusters')

    run_benchmark_grid({
        'n_clusters': [1000, 100, 10],
        'n_features': [128, 32, 16],
        'n_samples': [50000, 10000, 1000, 100, 10],
        'niters': [5],
    }, 'large clusters test')

    run_benchmark_grid({
        'n_clusters': [1000],
        'n_features': [32],
        'n_samples': [10000, 1000, 100, 10],
        'batch_size': [100, 500, 1000],
        'niters': [5],
    }, 'batch_size test')


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement