# Untitled

from sklearn import cluster, metrics
import numpy as np
from matplotlib import pyplot, mlab

def kmeans(cluster_input, n_clusters=2):
    """Fit k-means to ``cluster_input``, then plot and score the result.

    Args:
        cluster_input: Samples to cluster; passed unchanged to
            ``KMeans.fit`` and to ``sil_score``.
        n_clusters: Number of clusters to form. Defaults to 2, the value
            that used to be hard-coded here.

    Returns:
        None. The fitted estimator is handed to ``plot_n_clusters`` and its
        labels to ``sil_score``, which prints the silhouette score.
    """
    k_means = cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(cluster_input)

    plot_n_clusters(n_clusters, k_means)
    sil_score(cluster_input, k_means.labels_)

def shiftmeans(cluster_input, quantile=0.5):
    """Fit mean-shift to ``cluster_input``, then plot and score the result.

    The kernel bandwidth is estimated from the data itself with
    ``cluster.estimate_bandwidth`` using every sample (``n_samples=None``).

    Args:
        cluster_input: Samples to cluster; passed unchanged to
            ``estimate_bandwidth``, ``MeanShift.fit`` and ``sil_score``.
        quantile: Quantile forwarded to ``estimate_bandwidth``. Defaults to
            0.5, the value that used to be hard-coded here.

    Returns:
        None. Prints the number of clusters found, hands the fitted
        estimator to ``plot_n_clusters`` and its labels to ``sil_score``.
    """
    bandwidth = cluster.estimate_bandwidth(
        cluster_input, quantile=quantile, n_samples=None
    )

    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(cluster_input)

    # Mean-shift chooses the number of clusters itself, so count the
    # distinct labels it assigned.
    n_clusters = len(np.unique(ms.labels_))
    print("Number of estimated clusters : ",  str(n_clusters))

    plot_n_clusters(n_clusters, ms)
    sil_score(cluster_input, ms.labels_)

def plot_n_clusters(n_clusters, cluster_obj, data=None):
    """Scatter-plot each cluster's members and its centre in one figure.

    Args:
        n_clusters: Number of clusters to draw; labels ``0 .. n_clusters-1``
            are looked up in ``cluster_obj``.
        cluster_obj: Fitted estimator exposing ``labels_`` and
            ``cluster_centers_``.
        data: Array of the samples that were clustered; columns 0 and 1 are
            used as the x and y coordinates. When omitted, the module-level
            ``cluster_input`` is used so that existing two-argument calls
            keep working.

    Returns:
        None. Opens a matplotlib figure and blocks in ``pyplot.show()``.
    """
    if data is None:
        # Backward compatibility: earlier versions always read the global.
        data = cluster_input

    all_colors = ['red', 'blue', 'green', 'purple', 'magenta', 'yellow',
                  'cyan', 'orange', 'black', 'gray', 'brown']

    pyplot.figure()
    for k in range(n_clusters):
        # Cycle through the palette so clusters beyond the 11th are still
        # drawn (previously they were silently skipped).
        col = all_colors[k % len(all_colors)]
        my_members = cluster_obj.labels_ == k
        cluster_center = cluster_obj.cluster_centers_[k]
        pyplot.plot(data[my_members, 0], data[my_members, 1], 'w',
                    markerfacecolor=col, marker='.')
        pyplot.plot(cluster_center[0], cluster_center[1], 'o',
                    markerfacecolor=col, markeredgecolor='k', markersize=6)

    # Use the estimator's class name (e.g. "KMeans") as the title; the old
    # code passed a one-element list such as ["KMeans'>"] instead.
    pyplot.title(type(cluster_obj).__name__)

    pyplot.xlabel('var1')
    pyplot.ylabel('var2')
    pyplot.show()

def sil_score(cluster_input,labels):
    """Compute, print and return the silhouette score of a labelling.

    Args:
        cluster_input: Samples that were clustered.
        labels: Cluster label assigned to each sample.

    Returns:
        The value returned by ``metrics.silhouette_score``, which is called
        with ``sample_size=1000``.
    """
    score = metrics.silhouette_score(
        cluster_input,
        labels,
        sample_size=1000,
    )
    print('Silhouette score: ', str(score))
    return score

# Path of the CSV file to cluster; replace the placeholder before running.
filename = "your_file_name_with_path"

# ``mlab.csv2rec`` was removed in matplotlib 3.1, so the file is read with
# numpy instead. ``names=True`` takes the field names from the header row.
# NOTE(review): csv2rec lower-cased header names, genfromtxt keeps their
# case -- confirm the column names below match the file's header.
dat = np.genfromtxt(filename, delimiter=",", names=True)
cluster_input = np.vstack([dat['col_name1'],dat['col_name2']]).transpose() # columns must be the same length

# Run both clustering methods on the same two-column input.
kmeans(cluster_input)
shiftmeans(cluster_input)