SHARE
TWEET

Untitled

a guest Nov 17th, 2019 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. import numpy as np
  3. import pandas as pd
  4. from common import describe_data, test_env
  5. from sklearn.cluster import KMeans
  6. from sklearn.preprocessing import MinMaxScaler
  7. import matplotlib.pyplot as plt
  8. from sklearn.manifold import TSNE
  9.  
  10.  
  11. def read_data(file):
  12.     try:
  13.         return pd.read_csv(file)
  14.     except FileNotFoundError:
  15.         exit("Error:" + file + " not found")
  16.  
  17.  
  18. def print_info(credit_info):
  19.     print("##Credit info")
  20.     print(credit_info.info())
  21.     print("\n##dataframe shape")
  22.     print(credit_info.shape)
  23.     print("\n##dataframe columns")
  24.     print(credit_info.columns)
  25.     print("\n##dataframe head and tail")
  26.     print(credit_info.head())
  27.     print(credit_info.tail())
  28.     print("\n##numeric value statistics")
  29.     print(credit_info.describe())
  30.  
  31.  
  32. def plot_clusters(X, y, figure, file):
  33.     colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
  34.               'tab:brown', 'tab:pink', 'tab:olive']
  35.     markers = ['o', 'X', 's', 'D']
  36.     color_idx = 0
  37.     marker_idx = 0
  38.  
  39.     plt.figure(figure)
  40.  
  41.     for cluster in range(0, len(set(y))):
  42.         plt.scatter(X[y == cluster, 0], X[y == cluster, 1],
  43.                     s=5, c=colors[color_idx], marker=markers[marker_idx])
  44.         color_idx = 0 if color_idx == (len(colors) - 1) else color_idx + 1
  45.         marker_idx = 0 if marker_idx == (len(markers) - 1) else marker_idx + 1
  46.  
  47.     plt.title(figure)
  48.     plt.xticks([])
  49.     plt.yticks([])
  50.     plt.savefig(file, papertype='a4')
  51.  
  52.     plt.show()
  53.  
  54.  
  55. def clustering(df):
  56.     Sum_of_squared_distances = []
  57.     K = range(1, 15)
  58.     for k in K:
  59.         km = KMeans(n_clusters=k, init='k-means++', random_state=0)
  60.         km = km.fit(df)
  61.         Sum_of_squared_distances.append(km.inertia_)
  62.     plt.plot(K, Sum_of_squared_distances)
  63.     plt.xlabel('number of clusters')
  64.     plt.ylabel('WCSS')
  65.     plt.grid()
  66.     plt.title('The Elbow Method')
  67.     plt.savefig('results/cc_wcss_plot.png', papertype='a4')
  68.     plt.show()
  69.  
  70.  
  71. if __name__ == '__main__':
  72.     modules = ['numpy', 'pandas', 'sklearn']
  73.     test_env.versions(modules)
  74.     df = read_data('data/cc_general.csv')
  75.     print_info(df)
  76.     df = df.drop(columns="CUST_ID")
  77.     df = df.fillna(df.mean())
  78.     X = df.values
  79.     cs = df.columns
  80.     for c in cs:
  81.         print("\nunique values for" + c + " is")
  82.         print(df[c].unique())
  83.     mms = MinMaxScaler()
  84.     mms.fit(df)
  85.     df_transformed = mms.transform(df)
  86.     clustering(X)
  87.     n_clusters = 8
  88.     k_means = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
  89.     y_kmeans = k_means.fit_predict(X)
  90.     X_tsne = TSNE(n_components=2, random_state=0).fit_transform(X)
  91.  
  92.     plot_clusters(X_tsne, np.full(X_tsne.shape[0], 0),
  93.                   't-SNE visualisation without clusters', 'results/cc_tsne_no_clusters.png')
  94.     plot_clusters(X_tsne, y_kmeans, 'k means clusters with TSNE',
  95.                   'results/cc_tsne_X_clusters.png')
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top