Advertisement
Guest User

Untitled

a guest
Mar 21st, 2019
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.41 KB | None | 0 0
  1. import statsmodels.api as sm
  2. from scipy import stats
# Attach a `chisqprob` attribute to scipy.stats that returns the chi-square
# survival function (upper-tail probability) for statistic `chisq` with `df`
# degrees of freedom.
# NOTE(review): presumably a compatibility shim for a dependency that still
# calls `scipy.stats.chisqprob` — confirm which one needs it.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
  4. import pandas as pd
  5. from matplotlib import pyplot as plt
  6. import seaborn as sns
  7. import numpy as np
  8. from matplotlib import rcParams
  9. import scipy.stats as stats
  10. import pylab
  11. from sklearn.linear_model import LogisticRegression
  12. from sklearn.linear_model import LinearRegression
  13. from sklearn.tree import DecisionTreeClassifier
  14. from sklearn.ensemble import RandomForestClassifier
  15. from sklearn.ensemble import RandomForestRegressor
  16. from sklearn.svm import SVC
  17. from sklearn.metrics import confusion_matrix
  18. from sklearn.metrics import roc_curve
  19. from sklearn.model_selection import train_test_split
  20. from sklearn.metrics import accuracy_score
  21. from sklearn.metrics import confusion_matrix
  22. import math
  23. import re
# Global matplotlib text defaults applied to every figure created after this
# point: sans-serif family, Roboto as the preferred sans-serif face, 20 pt text.
# NOTE(review): assumes the Roboto font is installed on the machine — confirm,
# otherwise matplotlib will have to substitute another font.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Roboto']
rcParams['font.size'] = 20
  27. from matplotlib.colors import ListedColormap
  28. from sklearn import datasets
  29. from random import randint
  30. from nltk.corpus import stopwords
  31.  
# Six-colour hex palette; plot_scatter_2d indexes into `col` by class position.
col = ['#ef4631', '#10b9ce', '#2292ec', '#ff9138', '#3f50b0', '#f7bb09']
# Second six-colour hex palette of the same length.
# NOTE(review): by name these look like lighter tints of `col`, index for
# index — confirm before relying on the pairing.
cols_light = ['#f47f71', '#72dde9','#66b4f2','#ffb478','#7a86c8','#f9cf55']
  34.  
  35. def plot_scatter_2d(df, x1, x2, y, ax=None):
  36. """"
  37. Description: Plots a scatter plot of two dependent variables, with different color/shape per class
  38. IMPORTRANT: Works for only 2 independent variables, given 1 dependent
  39.  
  40. Parameters:
  41. df(n x m): dataframe containing variables of interest
  42. x1 (string): name of x-axis variable
  43. x2 (string): name of y-axis variable
  44. y (string): name of labels
  45.  
  46. Returns:
  47. Scatter plot
  48. """
  49. if ax == None:
  50. f, ax = plt.subplots(1, figsize = (10,8))
  51.  
  52. for idx, cl in enumerate(np.unique(df[y])):
  53.  
  54. ax.scatter(x=df[df[y]==cl][x1],
  55. y=df[df[y]==cl][x2],
  56. label = cl,
  57. c = col[idx],
  58. s = 100)
  59.  
  60.  
  61. if ax == None:
  62. plt.title(x1 + ' vs ' + x2)
  63. plt.legend()
  64. plt.xlabel(x1)
  65. plt.ylabel(x2)
  66. plt.show()
  67.  
  68. def plot_assignments(df, mean1, mean2, mean3, feature1, feature2, wLabel = False):
  69.  
  70. """
  71. Function to plot the movement of the centroids as well as the point assignments per iteration.
  72. Note that this is just to show the method, and does not really need to be done by you.
  73.  
  74. Parameters
  75. -------------------------
  76. df : The dataframe containing your data
  77. mean1 : centroid 1
  78. mean2 : centroid 2
  79. mean3 : centroid 3
  80. wLabel : Option to show the labels or not
  81. """
  82. col_ex = ['#ef4631', '#2292ec', '#3f50b0']
  83. labelling = ['setosa', 'versicolor', 'virginica']
  84. temp_df = df.copy()
  85. cluster_mem = []
  86.  
  87. for i in range(df.shape[0]):
  88. pt = (df[feature1][i],df[feature2][i])
  89.  
  90. dist1 = np.sqrt((pt[0]-mean1[0])**2 + (pt[1]-mean1[1])**2)
  91. dist2 = np.sqrt((pt[0]-mean2[0])**2 + (pt[1]-mean2[1])**2)
  92. dist3 = np.sqrt((pt[0]-mean3[0])**2 + (pt[1]-mean3[1])**2)
  93.  
  94. cluster_mem.append(np.argmin([dist1, dist2, dist3]))
  95.  
  96. temp_df['Cluster'] = cluster_mem
  97.  
  98. if wLabel == False:
  99. f, ax = plt.subplots(1, figsize = (8,6))
  100. for idx, cl in enumerate(np.unique(temp_df['Cluster'])):
  101.  
  102. ax.scatter(x=temp_df[temp_df['Cluster']==cl][feature1],
  103. y=temp_df[temp_df['Cluster']==cl][feature2],
  104. c = col_ex[idx],
  105. label=labelling[idx],
  106. s = 100, alpha = 0.1)
  107.  
  108. ax.scatter(x = mean1[0], y = mean1[1], color = col_ex[0], s = 200)
  109. ax.scatter(x = mean2[0], y = mean2[1], color = col_ex[1], s = 200)
  110. ax.scatter(x = mean3[0], y = mean3[1], color = col_ex[2], s = 200)
  111.  
  112. #plt.legend()
  113. plt.xlabel(feature1)
  114. plt.ylabel(feature2)
  115.  
  116. plt.show()
  117.  
  118. else:
  119.  
  120. f, ax = plt.subplots(1,2, figsize = (20, 10))
  121. for idx, cl in enumerate(np.unique(temp_df['Cluster'])):
  122.  
  123. ax[0].scatter(x=temp_df[temp_df['Cluster']==cl][feature1],
  124. y=temp_df[temp_df['Cluster']==cl][feature2],
  125. c = col_ex[idx],
  126. s = 100, alpha = 0.1)
  127.  
  128. ax[0].scatter(x = mean1[0], y = mean1[1], color = col_ex[0], s = 200)
  129. ax[0].scatter(x = mean2[0], y = mean2[1], color = col_ex[1], s = 200)
  130. ax[0].scatter(x = mean3[0], y = mean3[1], color = col_ex[2], s = 200)
  131.  
  132. ax[0].set_title('Clustering Result')
  133. ax[1].set_title('Actual Data')
  134.  
  135. ax[0].set_xlabel([feature1])
  136. ax[0].set_ylabel([feature2])
  137. ax[1].set_xlabel([feature1])
  138. ax[1].set_ylabel([feature2])
  139.  
  140.  
  141. plt.xlabel(feature1)
  142. plt.ylabel(feature2)
  143.  
  144. for idx, cl in enumerate(np.unique(temp_df['species'])):
  145.  
  146. ax[1].scatter(x=temp_df[temp_df['species']==cl][feature1],
  147. y=temp_df[temp_df['species']==cl][feature2],
  148. c = col_ex[idx],
  149. label=labelling[idx],
  150. s = 100)
  151.  
  152. plt.legend()
  153. plt.show()
  154.  
  155. new_mean1 = temp_df[temp_df.Cluster == 0].mean(axis = 0)
  156. new_mean2 = temp_df[temp_df.Cluster == 1].mean(axis = 0)
  157. new_mean3 = temp_df[temp_df.Cluster == 2].mean(axis = 0)
  158. print(new_mean1, new_mean2, new_mean3)
  159. return (new_mean1, new_mean2, new_mean3)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement