Advertisement
Guest User

Untitled

a guest
Apr 5th, 2020
224
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.17 KB | None | 0 0
  1. import pandas as pd
  2. from dateutil.parser import parse
  3. import matplotlib.pyplot as plt
  4. def TienXuLy():
  5.     # Tiền xử lý
  6.     data = pd.read_csv("/home/huyphuong99/Documents/data_set.csv")
  7.     data = data[data["Customer ID"].notnull()]
  8.     data = data.drop(index = data[data['Invoice'].astype(str).str[0] == 'C'].index)
  9.  
  10.     # chuan hoa sang ngay
  11.     date_time = []
  12.     X = data.values  # mảng numpy X lưu gía trị của file data.csv
  13.     for i in range(len(X[:, 4])):
  14.         date = parse(X[i, 4])
  15.         date_time.append(date)
  16.  
  17.     # Tim ngay max
  18.     max_date = date_time[0]
  19.     for i in range(len(date_time)):
  20.         if (max_date < date_time[i]):
  21.             max_date = date_time[i]
  22.  
  23.     for i in range(len(date_time)):
  24.         date_time[i] = max_date - date_time[i]
  25.  
  26.     data["Inv_Day"] = date_time
  27.     data["Inv_Day"] = data["Inv_Day"].astype("timedelta64[D]")
  28.     data["Total_Bill"] = data.Quantity * data.Price
  29.     RFM = data.groupby("Customer ID", as_index = False).agg({"Inv_Day": "min", "Invoice": "count", "Total_Bill": "sum"})
  30.     RFM.rename(columns = {"Inv_Day": "Recency", "Invoice": "Frequency", "Total_Bill": "Mometary"}, inplace = True)
  31.  
  32.     R = RFM["Recency"].tolist()
  33.     for i in range(len(R)):
  34.         R[i] = -R[i]
  35.  
  36.     F = RFM["Frequency"].tolist()
  37.     m = RFM["Mometary"].tolist()
  38.     #0->100
  39.     M = []
  40.     for i in range(len(m)):
  41.         t = (m[i] - min(m)) / (max(m) - min(m)) * (100 - 0) + 0
  42.         M.append(t)
  43.     ID = RFM["Customer ID"].tolist()
  44.  
  45.     X = []
  46.     for i in range(len(ID)):
  47.         X.append([R[i],F[i],M[i]])
  48.     return X
  49.  
  50.  
  51. import random
  52. import math
  53. import numpy as np
  54.  
  55. def eucldist(p0, p1):
  56.     dist = 0.0
  57.     for i in range(0, len(p0)):
  58.         dist += (p0[i] - p1[i]) ** 2
  59.     return math.sqrt(dist)
  60.  
  61. def kmeans(k, datapoints):
  62.  
  63.     d = len(datapoints[0])
  64.     i = 0
  65.     cluster = [0] * len(datapoints)
  66.     prev_cluster = [-1] * len(datapoints)
  67.  
  68.     cluster_centers = []
  69.     for i in range(0, k):
  70.  
  71.          cluster_centers += [random.choice(datapoints)]
  72.  
  73.          force_recalculation = False
  74.     while (cluster != prev_cluster) or (force_recalculation):
  75.  
  76.         prev_cluster = list(cluster)
  77.         force_recalculation = False
  78.         i += 1
  79.  
  80.         for p in range(0, len(datapoints)):
  81.             min_dist = float("inf")
  82.  
  83.             for c in range(0, len(cluster_centers)):
  84.                 dist = eucldist(datapoints[p], cluster_centers[c])
  85.                 if (dist < min_dist):
  86.                     min_dist = dist
  87.                     cluster[p] = c
  88.         for k in range(0, len(cluster_centers)):
  89.             new_center = [0] * d
  90.             members = 0
  91.             for p in range(0, len(datapoints)):
  92.                 if (cluster[p] == k):
  93.                     for j in range(0, d):
  94.                         new_center[j] += datapoints[p][j]
  95.                     members += 1
  96.             for j in range(0, d):
  97.                 if members != 0:
  98.                     new_center[j] = new_center[j] / members
  99.                 else:
  100.                     new_center = random.choice(datapoints)
  101.                     force_recalculation = True
  102.                     print("Forced Recalculation...")
  103.             cluster_centers[k] = new_center
  104.  
  105.     #print("======== Results ========")
  106.     #print("Clusters", cluster_centers)
  107.     #print("Iterations", i)
  108.     #print("Assignments", cluster)
  109.     return cluster_centers, cluster
  110.  
  111.  
  112. if __name__ == "__main__":
  113.     datapoints = TienXuLy()
  114.     # K - Number of Clusters
  115.     sse = []
  116.     K = range(1,16)
  117.     for k in K:
  118.         lable, cluster = kmeans(k, datapoints)
  119.         W = 0
  120.         for i in range(k):
  121.             w = 0
  122.             for j in range(len(cluster)):
  123.                 if i == cluster[j]:
  124.                     dist_ct = eucldist(lable[i],datapoints[j])
  125.                     w += dist_ct
  126.             W += w
  127.         print(k)
  128.         print(W)
  129.         sse.append(W)
  130.     plt.plot(K, sse, 'bx-')
  131.     plt.xlabel('\nValues of K\n')
  132.     plt.ylabel('Distortion')
  133.     plt.title('\nThe Elbow Method using Distortion\n')
  134.     plt.show()
  135.     print("\n")
  136.     '''k =
  137.    labelM, clusterM = kmeans(k,datapoints)
  138.    print(labelM)
  139.    print(clusterM)'''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement