1. #!/usr/bin/env python
2. import numpy as np
3. import pandas as pd
4. from common import describe_data, test_env
5. from sklearn.cluster import KMeans
6. from sklearn.preprocessing import MinMaxScaler
7. import matplotlib.pyplot as plt
8. from sklearn.manifold import TSNE
9.
10.
12.     try:
14.     except FileNotFoundError:
16.
17.
def print_info(credit_info: pd.DataFrame) -> None:
    """Print a quick textual overview of a DataFrame to stdout.

    Emits, in order: the ``DataFrame.info()`` summary, the shape, the column
    index, the last rows (``tail()``) and the ``describe()`` statistics.

    Args:
        credit_info: DataFrame to summarise.
    """
    print("##Credit info")
    # DataFrame.info() writes its report itself and returns None; wrapping the
    # call in print() would append a stray "None" line to the output.
    credit_info.info()
    print("\n##dataframe shape")
    print(credit_info.shape)
    print("\n##dataframe columns")
    print(credit_info.columns)
    print(credit_info.tail())
    print("\n##numeric value statistics")
    print(credit_info.describe())
30.
31.
def plot_clusters(X, y, figure, file):
    """Scatter-plot 2-D points styled by cluster label, then save and show.

    Each distinct label in ``y`` is drawn with its own colour/marker pair;
    both palettes wrap around when there are more labels than entries.

    Args:
        X: array indexed as ``X[mask, 0]`` / ``X[mask, 1]``; the first two
            columns are used as plot coordinates.
        y: one cluster label per row of ``X``.
        figure: figure identifier passed to ``plt.figure``; also used as the
            plot title.
        file: destination passed to ``plt.savefig``.
    """
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
              'tab:brown', 'tab:pink', 'tab:olive']
    markers = ['o', 'X', 's', 'D']

    # Guarantee element-wise comparison below even if a plain list is passed.
    labels = np.asarray(y)

    plt.figure(figure)

    # Iterate over the labels that are actually present instead of assuming
    # they are exactly 0..k-1; for such labels the order is unchanged.
    for position, cluster in enumerate(np.unique(labels)):
        members = labels == cluster
        plt.scatter(X[members, 0], X[members, 1], s=5,
                    c=colors[position % len(colors)],
                    marker=markers[position % len(markers)])

    plt.title(figure)
    plt.xticks([])
    plt.yticks([])
    # `papertype` only ever applied to PostScript output and has been
    # deprecated and then removed from savefig, so it is no longer passed.
    plt.savefig(file)

    plt.show()
53.
54.
def clustering(df, max_clusters=14):
    """Plot the K-Means elbow curve (WCSS against number of clusters).

    Fits one ``KMeans`` model per cluster count from 1 to ``max_clusters``
    (inclusive), records each model's ``inertia_``, then plots the curve,
    saves it to ``results/cc_wcss_plot.png`` and shows it.

    Args:
        df: feature matrix passed unchanged to ``KMeans.fit``.
        max_clusters: largest cluster count to evaluate. The default of 14
            reproduces the previously hard-coded ``range(1, 15)``.
    """
    cluster_counts = range(1, max_clusters + 1)
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=0).fit(df).inertia_
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, wcss)
    plt.xlabel('number of clusters')
    plt.ylabel('WCSS')
    plt.grid()
    plt.title('The Elbow Method')
    # `papertype` only ever applied to PostScript output and has been
    # deprecated and then removed from savefig, so it is no longer passed.
    plt.savefig('results/cc_wcss_plot.png')
    plt.show()
69.
70.
if __name__ == '__main__':
    modules = ['numpy', 'pandas', 'sklearn']
    test_env.versions(modules)

    # NOTE(review): `df` is not assigned anywhere in the visible listing; the
    # statement that loads the data set must run before this point - TODO
    # confirm and restore it.
    print_info(df)

    # Drop the identifier column and replace missing values with column means.
    df = df.drop(columns="CUST_ID")
    df = df.fillna(df.mean())

    for c in df.columns:
        print("\nunique values for " + c + " is")
        print(df[c].unique())

    # Min-max scale the features and cluster on the scaled matrix. Previously
    # the scaled result was computed but discarded, so K-Means and t-SNE ran
    # on the raw values.
    # NOTE(review): this changes the resulting clusters - confirm intended.
    mms = MinMaxScaler()
    mms.fit(df)
    X = mms.transform(df)

    clustering(X)

    n_clusters = 8
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    y_kmeans = k_means.fit_predict(X)
    X_tsne = TSNE(n_components=2, random_state=0).fit_transform(X)

    # First plot puts every point in a single cluster (label 0) to show the
    # raw embedding; the second plot styles points by their K-Means label.
    plot_clusters(X_tsne, np.full(X_tsne.shape[0], 0),
                  't-SNE visualisation without clusters', 'results/cc_tsne_no_clusters.png')
    plot_clusters(X_tsne, y_kmeans, 'k means clusters with TSNE',
                  'results/cc_tsne_X_clusters.png')
