Guest User

Untitled

a guest
Oct 21st, 2017
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.01 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import tensorflow as tf
  4. import matplotlib.pyplot as plt
  5. import seaborn as sns
  6. #uncomment below if using Jupyter
  7. #%config InlineBackend.figure_format = 'retina'
  8.  
  9. # Load the input table from CSV (the path looks like a placeholder; point it at a real file)
  10. df = pd.read_csv('../some/data/path')
  11.  
  12. def k_means_clustering(df_col_1, df_col_2, clusters, steps=100):
  13. '''
  14. Takes in two dataframe columns and outputs plot of clusters.
  15. '''
  16. vec_vals = []
  17. for i in range(len(df_col_1)):
  18. a = df_col_1[i]
  19. b = df_col_2[i]
  20. vec_vals.append([a, b])
  21. v_vals = np.array(vec_vals)
  22. np.random.shuffle(v_vals)
  23.  
  24. sess = tf.Session()
  25. k = clusters
  26. points = v_vals
  27. data = tf.constant(points)
  28.  
  29. # random initial centroids (points shuffled above)
  30. centroids = tf.Variable(data[:k, :])
  31.  
  32. # add k dim to data and n dim to centroids to make matrices compatible
  33. # for array operations instead of loops
  34. data_expanded = tf.expand_dims(data, 0)
  35. centroids_expanded = tf.expand_dims(centroids, 1)
  36.  
  37. # computes squared Euclidean distance between every point and every centroid
  38. # and get closest centroid for each point
  39. allocations = tf.argmin(tf.reduce_sum(tf.square(data_expanded - centroids_expanded), 2), 0)
  40.  
  41. sess.run(tf.global_variables_initializer())
  42. c = 0 # index of centroid
  43. tf.equal(allocations, c)
  44. tf.gather(data, tf.where(tf.equal(allocations, c)))
  45.  
  46. means = tf.concat(
  47. [tf.reduce_mean(
  48. tf.gather(data,
  49. tf.where(tf.equal(allocations, c))), 0) for c in range(k)], 0)
  50.  
  51. update_centroids = tf.assign(centroids, means)
  52.  
  53. for step in range(steps):
  54. _, centroid_values, allocation_values = sess.run([update_centroids, centroids, allocations])
  55.  
  56. clusters_df = pd.DataFrame({df_col_1.name: points[:,0], df_col_2.name: points[:,1], "cluster": allocation_values})
  57. sns.lmplot(df_col_1.name, df_col_2.name, data=clusters_df, fit_reg=False, size=6, hue="cluster")
  58. plt.show()
  59.  
  60.  
  61. k_means_clustering(df['col_name_a'], df['col_name_b'], 3)  # cluster the two columns into 3 groups (default 100 steps); column names look like placeholders
Add Comment
Please, Sign In to add comment