Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import matplotlib.pyplot as plt
- from sklearn import cluster
- from scipy.spatial import distance
- import sklearn.datasets
- from sklearn.preprocessing import StandardScaler
- import numpy as np
def compute_bic(kmeans, X):
    """
    Compute the Bayesian Information Criterion (BIC) for a fitted k-means model.

    Parameters
    ----------
    kmeans : fitted scikit-learn clustering object
        Must expose ``cluster_centers_``, ``labels_`` and ``n_clusters``.
    X : np.ndarray of shape (N, d)
        The data the model was fitted on.

    Returns
    -------
    float
        BIC value; under this sign convention, higher is better.
    """
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # cluster sizes; minlength guards against a trailing empty cluster,
    # which would otherwise make n shorter than m and raise IndexError
    n = np.bincount(labels, minlength=m)
    # size of the data set
    N, d = X.shape
    # pooled within-cluster variance estimate (degrees of freedom N - m)
    cl_var = (1.0 / (N - m) / d) * sum(
        np.sum(distance.cdist(X[labels == i], [centers[i]], 'euclidean') ** 2)
        for i in range(m)
    )
    const_term = 0.5 * m * np.log(N) * (d + 1)
    bic = np.sum([n[i] * np.log(n[i]) -
                  n[i] * np.log(N) -
                  ((n[i] * d) / 2) * np.log(2 * np.pi * cl_var) -
                  ((n[i] - 1) * d / 2) for i in range(m)]) - const_term
    return float(bic)
# --- Script: pick the number of clusters for the Wholesale-customers data
# --- by fitting k-means for each candidate k and maximising the BIC.
path = 'C:/Users/Lionel/Downloads'
file = 'Wholesale customers data.csv'
data = pd.read_csv(path + '/' + file)
# Cluster on the numeric feature columns only (the first two columns are
# presumably categorical Channel/Region fields -- TODO confirm against the CSV).
X = np.array(data.iloc[:, 2:])
# Xs = StandardScaler().fit_transform(X)  # optional standardisation, left disabled
ks = range(1, 100)
# Fit k-means once per candidate k (99 fits in total) and keep every model.
kmeans_models = [cluster.KMeans(n_clusters=k, init="k-means++").fit(X) for k in ks]
# Score each fitted model with the BIC.
BIC = [compute_bic(model, X) for model in kmeans_models]
# Best k = index of the maximum BIC, shifted by 1 because ks starts at 1.
kopt = BIC.index(max(BIC)) + 1
print(BIC)
print('kopt = ' + str(kopt))
plt.plot(ks, BIC, 'r-o')
plt.title("BIC vs number of clusters")
plt.xlabel("# clusters")
plt.ylabel("# BIC")
plt.show()
Add Comment
Please, Sign In to add comment