unsupervised.py
# source: https://github.com/joshuamorton/Machine-Learning

import argparse

import numpy as np
from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA as FICA
from sklearn.random_projection import GaussianRandomProjection as RandomProjection
from sklearn.feature_selection import SelectKBest as SKBest
from sklearn.feature_selection import f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans as KM
from sklearn.mixture import GaussianMixture as EM
from sklearn import preprocessing
from sklearn import metrics

from scipy.stats import kurtosis

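# Unsupervised-learning experiments on one named dataset: clustering
# (k-means and EM / Gaussian mixtures) and four dimensionality-reduction or
# feature-selection methods (PCA, ICA, random projections, SelectKBest),
# each followed by re-clustering and a neural-network benchmark.
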
def load(filename):
    with open(filename) as data:
        # drop any instance with a missing attribute (marked "?")
        instances = [line for line in data if "?" not in line]

    return np.loadtxt(instances, delimiter=',')

def create_dataset(name, test, train):
    training_set = load("data/" + train)
    testing_set = load("data/" + test)
    # the class label is the last column of each row
    train_x, train_y = np.hsplit(training_set, [training_set[0].size - 1])
    test_x, test_y = np.hsplit(testing_set, [testing_set[0].size - 1])

    # standardize the steel-faults attributes
    if name == "steelfaults":
        train_x = preprocessing.scale(train_x)
        test_x = preprocessing.scale(test_x)

    return train_x, train_y, test_x, test_y

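# Expected CSV layout (hypothetical three-attribute rows; the class label is
# the last column, and any row containing "?" is dropped by load()):
#
#   1.2,0.4,7.0,1
#   0.9,?,6.1,0    <- skipped
#
# Each reduction routine below fits on the training data, saves a diagnostic
# plot to <name>_out/, then re-runs EM, k-means, and the neural network on
# the reduced features.
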
def pca(name, train_x, train_y, test_x, test_y):
    # keep half the original dimensions; n_components must be an integer
    compressor = PCA(n_components=train_x[1].size // 2)
    compressor.fit(X=train_x)
    dim_train_x = compressor.transform(train_x)
    dim_test_x = compressor.transform(test_x)

    # reconstruction error as a function of the number of retained components
    recon_err = np.zeros(train_x[1].size)
    for i in range(2, train_x[1].size):
        ccompressor = PCA(n_components=i)
        ccompressor.fit(X=train_x)
        cdim_train_x = ccompressor.transform(train_x)
        cre_train_x = ccompressor.inverse_transform(cdim_train_x)
        recon_err[i] = metrics.mean_squared_error(train_x, cre_train_x)
    print(recon_err)
    plt.plot(recon_err)
    title = plt.title("PCA Reconstruction Error")
    plt.xlabel("# Components")
    plt.ylabel("SSE")
    plt.savefig(name + "_out/" + title.get_text() + ".png", dpi=500)
    plt.clf()

    plt.plot(compressor.explained_variance_)
    title = plt.title("PCA Eigenvalues")
    plt.xlabel("Attribute # (Sorted)")
    plt.ylabel("Eigenvalue")
    plt.xticks(range(compressor.explained_variance_.size))
    plt.savefig(name + "_out/" + title.get_text() + ".png", dpi=500)
    plt.clf()

    # re-run clustering and the neural network on the PCA-reduced data
    em(name, dim_train_x, train_y, dim_test_x, test_y, add="with PCA ", max_cluster=30)
    km(name, dim_train_x, train_y, dim_test_x, test_y, add="with PCA ", max_cluster=30)
    nn(name, dim_train_x, train_y, dim_test_x, test_y, add="with PCA ")

def ica(name, train_x, train_y, test_x, test_y):
    compressor = FICA(n_components=train_x[1].size // 2)
    compressor.fit(X=train_x)
    dim_train_x = compressor.transform(train_x)
    dim_test_x = compressor.transform(test_x)

    # ICA seeks maximally non-Gaussian components, so report the kurtosis
    # of each original attribute
    for i in range(train_x[1].size):
        print(kurtosis(train_x.T[i]))

    em(name, dim_train_x, train_y, dim_test_x, test_y, add="with ICA ", max_cluster=30)
    km(name, dim_train_x, train_y, dim_test_x, test_y, add="with ICA ", max_cluster=30)
    nn(name, dim_train_x, train_y, dim_test_x, test_y, add="with ICA ")

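# Random projections are stochastic, so the error curve below averages ten
# independent fits per component count; "reconstruction" multiplies back
# through components_, which only approximates a true inverse.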
def randproj(name, train_x, train_y, test_x, test_y):
    compressor = RandomProjection(n_components=5)
    compressor.fit(X=train_x)
    dim_train_x = compressor.transform(train_x)
    dim_test_x = compressor.transform(test_x)

    recon_err = np.zeros(train_x[1].size)
    for i in range(2, train_x[1].size):
        for j in range(10):
            ccompressor = RandomProjection(n_components=i)
            ccompressor.fit(X=train_x)
            cdim_train_x = ccompressor.transform(train_x)
            cre_train_x = cdim_train_x.dot(ccompressor.components_)
            recon_err[i] = recon_err[i] + metrics.mean_squared_error(train_x, cre_train_x)
    recon_err = [i / 10 for i in recon_err]
    plt.plot(recon_err)
    title = plt.title("RP Reconstruction Error")
    plt.xlabel("# Components")
    print(train_x[1].size)
    plt.ylabel("SSE")
    plt.savefig(name + "_out/" + title.get_text() + ".png", dpi=500)
    plt.clf()

    em(name, dim_train_x, train_y, dim_test_x, test_y, add="with RP ", max_cluster=30)
    km(name, dim_train_x, train_y, dim_test_x, test_y, add="with RP ", max_cluster=30)
    nn(name, dim_train_x, train_y, dim_test_x, test_y, add="with RP ")

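# Unlike the three unsupervised reducers above, SelectKBest is supervised:
# it keeps the five features with the highest ANOVA F-scores (f_classif)
# against the class labels.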
def kbest(name, train_x, train_y, test_x, test_y):
    compressor = SKBest(score_func=f_classif, k=5)
    compressor.fit(X=train_x, y=train_y)
    dim_train_x = compressor.transform(train_x)
    dim_test_x = compressor.transform(test_x)

    em(name, dim_train_x, train_y, dim_test_x, test_y, add="with KB ", max_cluster=30)
    km(name, dim_train_x, train_y, dim_test_x, test_y, add="with KB ", max_cluster=30)
    nn(name, dim_train_x, train_y, dim_test_x, test_y, add="with KB ")

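# Sweep the number of mixture components from 2 to max_cluster, recording the
# log-likelihood lower bound plus silhouette and homogeneity scores, then
# append the max_cluster-component cluster assignments as an extra feature
# for the neural network.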
def em(name, train_x, train_y, test_x, test_y, add="", max_cluster=5):
    clf_loglikely_err = np.zeros(max_cluster + 1)
    clf_silhouette_err = np.zeros(max_cluster + 1)
    train_homo_err = np.zeros(max_cluster + 1)
    test_homo_err = np.zeros(max_cluster + 1)

    for i in range(2, max_cluster + 1):
        clf = EM(n_components=i)
        clf.fit(train_x)

        train_y_clf = clf.predict(train_x)
        test_y_clf = clf.predict(test_x)

        # flatten the label column vectors in place for the metric calls
        train_y.shape = (train_y.shape[0],)
        test_y.shape = (test_y.shape[0],)

        clf_loglikely_err[i] = clf.lower_bound_
        clf_silhouette_err[i] = metrics.silhouette_score(train_x, train_y_clf)
        train_homo_err[i] = metrics.homogeneity_score(train_y, train_y_clf)
        test_homo_err[i] = metrics.homogeneity_score(test_y, test_y_clf)

    fig, ax1 = plt.subplots()
    l1, = ax1.plot(clf_silhouette_err)
    l2, = ax1.plot(train_homo_err)
    l3, = ax1.plot(test_homo_err)
    plt.xlabel("Number of Components")
    ax1.set_ylabel("Metric Value")
    ax2 = ax1.twinx()
    l4, = ax2.plot(clf_loglikely_err, 'b-')
    ax2.set_ylabel("Log-Likelihood Lower Bound")
    title = plt.title("Expectation Maximization " + add)
    plt.xlim(2, max_cluster)
    plt.legend([l1, l2, l3, l4],
               ['Silhouette Score', 'Homogeneity Score (training)',
                'Homogeneity Score (testing)', 'Log-Likelihood'])
    fig.tight_layout()
    plt.savefig(name + "_out/" + title.get_text() + ".png", dpi=500)
    plt.clf()

    clf = EM(n_components=max_cluster)
    clf.fit(train_x)
    train_y_clf = clf.predict(train_x)
    test_y_clf = clf.predict(test_x)

    # append the cluster assignment as a new attribute and retrain the NN
    dim_train_x = np.c_[train_x, train_y_clf]
    dim_test_x = np.c_[test_x, test_y_clf]
    nn(name, dim_train_x, train_y, dim_test_x, test_y, add="on EM " + add)

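# The same sweep for k-means, tracking within-cluster SSE (inertia) in place
# of the mixture log-likelihood.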
def km(name, train_x, train_y, test_x, test_y, add="", max_cluster=5):
    clf_inertia_err = np.zeros(max_cluster + 1)
    clf_silhouette_err = np.zeros(max_cluster + 1)
    train_homo_err = np.zeros(max_cluster + 1)
    test_homo_err = np.zeros(max_cluster + 1)

    for i in range(2, max_cluster + 1):
        clf = KM(n_clusters=i, max_iter=5000)
        clf.fit(train_x)

        train_y_clf = clf.predict(train_x)
        test_y_clf = clf.predict(test_x)

        # flatten the label column vectors in place for the metric calls
        train_y.shape = (train_y.shape[0],)
        test_y.shape = (test_y.shape[0],)

        clf_inertia_err[i] = clf.inertia_
        clf_silhouette_err[i] = metrics.silhouette_score(train_x, train_y_clf)
        train_homo_err[i] = metrics.homogeneity_score(train_y, train_y_clf)
        test_homo_err[i] = metrics.homogeneity_score(test_y, test_y_clf)

    fig, ax1 = plt.subplots()
    l1, = ax1.plot(clf_silhouette_err)
    l2, = ax1.plot(train_homo_err)
    l3, = ax1.plot(test_homo_err)
    ax1.set_xlabel("Number of Clusters")
    ax1.set_ylabel("Metric Value")
    ax2 = ax1.twinx()
    l4, = ax2.plot(clf_inertia_err, 'b-')
    ax2.set_ylabel("Inertia Value")
    title = plt.title("k-Means " + add)
    plt.xlim(2, max_cluster)
    plt.legend([l1, l2, l3, l4],
               ['Silhouette Score', 'Homogeneity Score (training)',
                'Homogeneity Score (testing)', 'Inertia'])
    fig.tight_layout()
    plt.savefig(name + "_out/" + title.get_text() + ".png", dpi=500)
    plt.clf()

    clf = KM(n_clusters=max_cluster, max_iter=5000)
    clf.fit(train_x)
    train_y_clf = clf.predict(train_x)
    test_y_clf = clf.predict(test_x)

    # append the cluster assignment as a new attribute and retrain the NN
    dim_train_x = np.c_[train_x, train_y_clf]
    dim_test_x = np.c_[test_x, test_y_clf]
    nn(name, dim_train_x, train_y, dim_test_x, test_y, add="on KM " + add)

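# Error curves for an MLP at increasing iteration caps; note each point
# retrains a fresh network from scratch rather than resuming the previous one.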
def nn(name, train_x, train_y, test_x, test_y, add=""):
    clf_train_err = np.zeros(10)
    clf_test_err = np.zeros(10)

    train_y.shape = (train_y.shape[0],)
    test_y.shape = (test_y.shape[0],)

    for i in range(0, 10):
        clf = MLPClassifier(solver="lbfgs", hidden_layer_sizes=(102,), max_iter=i * 40 + 100)
        # debug output
        print('trainx: ', train_x)
        print('trainy: ', train_y)
        clf.fit(X=train_x, y=train_y)
        clf_train_err[i] = 1 - clf.score(train_x, train_y)
        clf_test_err[i] = 1 - clf.score(test_x, test_y)

    plt.plot(clf_train_err)
    plt.plot(clf_test_err)
    plt.xticks(range(0, 10), [100 + i * 40 for i in range(0, 10)])
    title = plt.title("Neural Network " + add)
    plt.xlabel("Number of Iterations")
    plt.ylabel("Error")
    plt.legend(['Training Error', 'Testing Error'])
    plt.savefig(name + "_out/" + title.get_text() + ".png", dpi=500)
    plt.clf()

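# 2-D cluster visualizations in the style of scikit-learn's k-means demo:
# for each reducer, project the data to two dimensions, fit k-means (outer
# pass k == 0) or a Gaussian mixture (k == 1), and color each mesh cell by
# its predicted cluster.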
def vis(name, train_x, train_y, test_x, test_y, add=""):
    for k in range(0, 2):
        # k == 0: k-means; k == 1: Gaussian mixture (EM)
        if k == 0:
            if name == 'steelfaults':
                kmeans = KM(init='k-means++', n_clusters=7)
                h = 0.2  # step size of the mesh over [x_min, x_max] x [y_min, y_max]
            else:
                kmeans = KM(init='k-means++', n_clusters=6)
                h = 1
        if k == 1:
            if name == 'steelfaults':
                kmeans = EM(n_components=7)
                h = 0.2
            else:
                kmeans = EM(n_components=6)
                h = 1
        for i in range(0, 4):
            if i == 0:
                reduced_data = PCA(n_components=2).fit_transform(train_x)
            if i == 1:
                reduced_data = FICA(n_components=2).fit_transform(train_x)
            if i == 2:
                reduced_data = RandomProjection(n_components=2).fit_transform(train_x)
            if i == 3:
                reduced_data = SKBest(score_func=f_classif, k=2).fit_transform(train_x, train_y)
            kmeans.fit(reduced_data)
            print("made data")

            # Plot the decision boundary by assigning a color to each mesh point
            x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
            y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
            print("mesh grid")
            # Obtain labels for each point in the mesh using the last trained model
            Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
            print("labels")
            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.figure(1)
            plt.clf()
            plt.imshow(Z, interpolation='nearest',
                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                       cmap=plt.cm.Paired,
                       aspect='auto', origin='lower')

            plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
            # Plot the k-means centroids as a white X (GaussianMixture has no
            # cluster_centers_ attribute)
            if k != 1:
                centroids = kmeans.cluster_centers_
                plt.scatter(centroids[:, 0], centroids[:, 1],
                            marker='x', s=169, linewidths=3,
                            color='w', zorder=10)
            plt.xlim(x_min, x_max)
            plt.ylim(y_min, y_max)
            plt.xticks(())
            plt.yticks(())
            plt.savefig(name + "_out/" + str(k) + " " + str(i) + ".png", dpi=500)
            plt.clf()

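# Expects data/<name>_train.csv and data/<name>_test.csv, plus an existing
# <name>_out/ directory for the figures (savefig will not create it).
# Example invocation, assuming a "steelfaults" dataset is present:
#
#   python unsupervised.py steelfaults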
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run clustering algorithms on stuff')
    parser.add_argument("name")
    args = parser.parse_args()
    name = args.name
    train = name + "_train.csv"
    test = name + "_test.csv"
    train_x, train_y, test_x, test_y = create_dataset(name, test, train)
    nn(name, train_x, train_y, test_x, test_y); print('nn done')
    em(name, train_x, train_y, test_x, test_y, max_cluster=30); print('em done')
    km(name, train_x, train_y, test_x, test_y, max_cluster=30); print('km done')
    pca(name, train_x, train_y, test_x, test_y); print('pca done')
    ica(name, train_x, train_y, test_x, test_y); print('ica done')
    randproj(name, train_x, train_y, test_x, test_y); print('randproj done')
    kbest(name, train_x, train_y, test_x, test_y); print('kbest done')
    vis(name, train_x[:2000], train_y[:2000], test_x, test_y); print('vis done')