Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import sklearn
- import math
# Load the data set, print its summary statistics, then separate the class
# column ("Y") from the two feature columns ("X1", "X2").
cdata = pd.read_csv("./cdata.txt")
print("cdata summary = ", cdata.describe(), "", "", sep="\n")
target = cdata.loc[:, "Y"]
cdata = cdata.loc[:, ["X1", "X2"]]

# Scatter plot of the raw observations, one colour per class value in "Y".
plt.figure()
for class_value, colour in ((1, "red"), (2, "blue"), (3, "green")):
    members = cdata[target == class_value]
    plt.scatter(members.X1, members.X2, color=colour, marker="o", label=str(class_value))
plt.xlabel("X1")
plt.ylabel("X2")
plt.title("Initial Data")
plt.legend()
plt.show()
# Elbow method: fit k-means for K = 1..10 and record the SSE (cohesion) of each fit.
from sklearn.cluster import KMeans

RANGE = range(1, 11)
# Each fit is seeded with the rows labelled 0..k-1 as its initial centroids;
# inertia_ is the fitted model's SSE.
sse = [
    KMeans(n_clusters=k, init=cdata.loc[0:k - 1, :]).fit(cdata).inertia_
    for k in RANGE
]

# SSE against K, drawn as a solid line with a marker on every evaluated K.
plt.figure()
plt.plot(RANGE, sse)
plt.scatter(RANGE, sse, marker="o")
plt.xlabel("K")
plt.ylabel("SSE")
plt.title("SSE = Cohesion for every K")
plt.show()
# Fit k-means with K = 3 and report centroids, labels, cohesion and separation.
K = 3
# Seed the centroids with the rows labelled 0..K-1.
kmeans = KMeans(n_clusters=K, init=cdata.loc[0:K - 1, :])
kmeans = kmeans.fit(cdata)
cohesion = kmeans.inertia_
print("K = " + str(K))
print("Centroids = ")
print(kmeans.cluster_centers_)
print("Labels = " + str(kmeans.labels_))
print("Cohesion = " + str(cohesion))

# Separation: for every cluster, (cluster size) * (squared Euclidean distance
# between the cluster mean and the overall mean), summed over all clusters.
# The squared distance is computed directly instead of taking a square root
# and squaring it again, and each cluster's rows are selected only once.
overall_mean = cdata.mean()
separation = 0
for label in sorted(set(kmeans.labels_)):
    members = cdata.loc[kmeans.labels_ == label, :]
    offset = members.mean() - overall_mean
    separation += len(members) * float((offset ** 2).sum())
print("Separation = " + str(separation))
print("CSS + BSS = " + str(cohesion + separation))
print()
print()
# Plot every observation coloured by its assigned cluster label, then overlay
# the fitted centroids as large black "+" markers.
centroids = kmeans.cluster_centers_
plt.figure()
plt.scatter(cdata["X1"], cdata["X2"], c=kmeans.labels_)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="+", s=169, c="black")
plt.xlabel("X1")
plt.ylabel("X2")
plt.title(f"Clustering with K = {K}")
plt.show()
# Silhouette and Silhouette Plot
from sklearn.metrics import silhouette_samples, silhouette_score

# The per-sample silhouette values do not depend on the loop variable, so
# compute them once instead of once per cluster.
sample_silhouettes = silhouette_samples(cdata, kmeans.labels_)
print("Average Silhouette for each cluster: ")
for cluster_id in range(K):
    print(np.mean(sample_silhouettes[kmeans.labels_ == cluster_id]))
print("Average Silhouette for the whole clustering = " + str(silhouette_score(cdata, kmeans.labels_)))
print()

# Silhouette Visualizer - Yellowbrick Library
import yellowbrick
from yellowbrick.cluster import SilhouetteVisualizer

visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
visualizer.fit(cdata)
visualizer.show()
# Heatmap of pairwise distances, with rows ordered by cluster label so that
# members of the same cluster sit next to each other.
from scipy.spatial import distance_matrix

# assign() attaches the labels to a copy, so the feature frame is never
# modified in place; the helper column is dropped again after sorting.
cdata = cdata.assign(cluster=kmeans.labels_).sort_values("cluster").drop(columns="cluster")
dist = distance_matrix(cdata, cdata)

# Open a dedicated figure, like every other plot in this script, so the
# image cannot be drawn onto a figure left current by an earlier plot.
plt.figure()
plt.imshow(dist, cmap='hot')
plt.colorbar()
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement