Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from dateutil.parser import parse
- import matplotlib.pyplot as plt
def TienXuLy(path="/home/huyphuong99/Documents/data_set.csv"):
    """Pre-process the invoice CSV into per-customer RFM feature vectors.

    The CSV must contain the columns ``Customer ID``, ``Invoice``,
    ``Quantity`` and ``Price``; the invoice timestamp is read from the
    fifth column (position 4), exactly as the original positional lookup did.

    Processing steps:
      1. Drop rows without a customer id and rows whose invoice number
         starts with ``"C"``.
      2. For every remaining row compute the whole number of days between
         its timestamp and the latest timestamp in the data.
      3. Aggregate per customer: minimum day distance (recency), number of
         invoice rows (frequency) and sum of ``Quantity * Price`` (monetary).
      4. Negate recency and min-max scale the monetary value to ``[0, 100]``.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A list with one ``[recency, frequency, monetary]`` entry per customer,
        ordered by customer id. Recency is a non-positive float, frequency an
        int and monetary a float in ``[0, 100]``. An empty list is returned
        when no rows survive the filtering.
    """
    data = pd.read_csv(path)

    # Pre-processing: keep rows with a known customer that are not "C" invoices.
    has_customer = data["Customer ID"].notnull()
    is_c_invoice = data["Invoice"].astype(str).str.startswith("C")
    # .copy() so the column assignments below do not write into a slice view.
    data = data[has_customer & ~is_c_invoice].copy()
    if data.empty:
        return []

    # Normalise the timestamps to "days before the most recent invoice".
    # Parsing is vectorised instead of one dateutil call per row.
    invoice_dates = pd.to_datetime(data.iloc[:, 4])
    # .dt.days replaces .astype("timedelta64[D]"), which pandas 2.x rejects.
    data["Inv_Day"] = (invoice_dates.max() - invoice_dates).dt.days
    data["Total_Bill"] = data.Quantity * data.Price

    RFM = data.groupby("Customer ID", as_index=False).agg(
        {"Inv_Day": "min", "Invoice": "count", "Total_Bill": "sum"}
    )
    RFM.rename(
        columns={
            "Inv_Day": "Recency",
            "Invoice": "Frequency",
            "Total_Bill": "Mometary",
        },
        inplace=True,
    )

    # Recency is negated so that a larger value means a more recent purchase.
    recency = [-float(days) for days in RFM["Recency"].tolist()]
    frequency = RFM["Frequency"].tolist()
    monetary = RFM["Mometary"].tolist()

    # Min-max scale monetary to 0..100; the bounds are computed once rather
    # than on every iteration.
    lowest = min(monetary)
    span = max(monetary) - lowest
    if span == 0:
        # All customers spent the same amount: avoid dividing by zero.
        scaled = [0.0] * len(monetary)
    else:
        scaled = [(value - lowest) / span * 100 for value in monetary]

    return [list(row) for row in zip(recency, frequency, scaled)]
- import random
- import math
- import numpy as np
def eucldist(p0, p1):
    """Return the Euclidean distance between points ``p0`` and ``p1``.

    Only the first ``len(p0)`` coordinates of ``p1`` are read; an
    ``IndexError`` is raised if ``p1`` has fewer coordinates than ``p0``.
    """
    squared_total = 0.0
    for axis, coordinate in enumerate(p0):
        squared_total += (coordinate - p1[axis]) ** 2
    return math.sqrt(squared_total)
def kmeans(k, datapoints):
    """Cluster ``datapoints`` into ``k`` groups with Lloyd's k-means algorithm.

    Centers are initialised from ``k`` randomly chosen, distinct positions in
    ``datapoints``. The assignment and update steps are then repeated until
    the assignments stop changing and no cluster had to be re-seeded. A
    cluster that ends up without members is re-seeded from a random data
    point, which forces another iteration.

    Args:
        k: Number of clusters; must be at least 1 and no larger than the
            number of distinct points.
        datapoints: Non-empty sequence of equally sized numeric sequences.

    Returns:
        A tuple ``(cluster_centers, cluster)`` where ``cluster_centers`` is a
        list of ``k`` coordinate lists and ``cluster[p]`` is the index of the
        center assigned to ``datapoints[p]``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            distinct points. Identical points always receive the same
            assignment, so in the latter case an empty cluster would be
            unavoidable and the loop would never terminate.
        IndexError: If ``datapoints`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    dims = len(datapoints[0])
    n_points = len(datapoints)
    n_distinct = len({tuple(point) for point in datapoints})
    if k > n_distinct:
        raise ValueError(
            "k (%d) exceeds the number of distinct data points (%d)"
            % (k, n_distinct)
        )

    cluster = [0] * n_points
    prev_cluster = [-1] * n_points

    # Sample positions without replacement and copy the coordinates so the
    # returned centers never alias the caller's data.
    cluster_centers = [
        list(datapoints[idx]) for idx in random.sample(range(n_points), k)
    ]

    force_recalculation = False
    while cluster != prev_cluster or force_recalculation:
        prev_cluster = list(cluster)
        force_recalculation = False

        # Assignment step: attach every point to its nearest center. The
        # strict "<" keeps the lowest-indexed center on ties.
        for p, point in enumerate(datapoints):
            min_dist = float("inf")
            for c, center in enumerate(cluster_centers):
                dist = eucldist(point, center)
                if dist < min_dist:
                    min_dist = dist
                    cluster[p] = c

        # Update step: accumulate per-cluster coordinate sums in one pass
        # over the data instead of rescanning all points for every cluster.
        sums = [[0] * dims for _ in range(k)]
        counts = [0] * k
        for point, label in zip(datapoints, cluster):
            counts[label] += 1
            totals = sums[label]
            for j in range(dims):
                totals[j] += point[j]

        for c in range(k):
            if counts[c]:
                cluster_centers[c] = [total / counts[c] for total in sums[c]]
            else:
                # Empty cluster: re-seed once (not once per dimension) and
                # make sure the loop runs again with the new center.
                cluster_centers[c] = list(random.choice(datapoints))
                force_recalculation = True
                print("Forced Recalculation...")

    return cluster_centers, cluster
if __name__ == "__main__":
    # Elbow method: run k-means for k = 1..15 and plot, for each k, the
    # total distance between every point and its assigned cluster center.
    datapoints = TienXuLy()

    sse = []
    K = range(1, 16)  # candidate numbers of clusters
    for k in K:
        centers, assignments = kmeans(k, datapoints)

        # Sum the point-to-center distances per cluster, then add the
        # per-cluster subtotals in cluster order.
        # NOTE(review): these are plain (not squared) Euclidean distances
        # even though the list is called `sse` — confirm that is intended.
        per_cluster = [0] * k
        for point, center_idx in zip(datapoints, assignments):
            per_cluster[center_idx] += eucldist(centers[center_idx], point)

        W = 0
        for subtotal in per_cluster:
            W += subtotal

        print(k)
        print(W)
        sse.append(W)

    plt.plot(K, sse, 'bx-')
    plt.xlabel('\nValues of K\n')
    plt.ylabel('Distortion')
    plt.title('\nThe Elbow Method using Distortion\n')
    plt.show()
    print("\n")

    # Disabled follow-up step (final clustering once k has been chosen):
    # k =
    # labelM, clusterM = kmeans(k,datapoints)
    # print(labelM)
    # print(clusterM)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement