AbdealiJK

Classifier mania

Nov 15th, 2015

import os, sys
from pprint import pprint
from collections import Counter

# Reorder sys.path so pip packages get higher priority than the apt-get ones.
# This makes the newer pip-installed sklearn version the one that is imported.
if ('/usr/local/lib/python2.7/dist-packages' in sys.path
    and (sys.path.index('/usr/local/lib/python2.7/dist-packages') >
         sys.path.index('/usr/lib/python2.7/dist-packages'))):
    sys.path.remove('/usr/local/lib/python2.7/dist-packages')
    sys.path.insert(1, '/usr/local/lib/python2.7/dist-packages')

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

import sklearn
import sklearn.cross_validation as cross_validation
from sklearn.lda import LDA
import sklearn.decomposition as decomposition
import sklearn.metrics as metrics
import sklearn.ensemble as ensemble
import sklearn.feature_selection as feature_selection
import sklearn.linear_model as linear_model
import sklearn.cluster as cluster
import sklearn.multiclass as multiclass
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.naive_bayes as naive_bayes
import sklearn.preprocessing as preprocessing


def fullprint(*args, **kwargs):
    # Print numpy arrays in full, temporarily disabling summarisation
    opt = np.get_printoptions()
    np.set_printoptions(threshold=np.inf)
    pprint(*args, **kwargs)
    np.set_printoptions(**opt)




# Get training data (cached as .npz after the first CSV load)
if os.path.exists("train.npz"):
    npzfile = np.load("train.npz")
    trainX = npzfile['trainX']
    trainY = npzfile['trainY']
else:
    trainX = np.genfromtxt("train_X.csv", delimiter=",")
    trainY = np.genfromtxt("train_Y.csv", delimiter=",")
    np.savez("train.npz", trainX=trainX, trainY=trainY)

# Get test data
if os.path.exists("test.npz"):
    npzfile = np.load("test.npz")
    testX = npzfile['testX']
else:
    testX = np.genfromtxt("test_X.csv", delimiter=",")
    np.savez("test.npz", testX=testX)

# Keep untouched copies of the raw data
dataX = trainX.copy()
dataY = trainY.copy()
data_testX = testX.copy()

NODATA = 0  # value used for missing entries
classes = np.unique(trainY)
num_feats = trainX.shape[1]
num_class = classes.size

assert num_feats == testX.shape[1]




# Some drawings/plots to understand the data
####################################################################

# Plots are saved into plots/ - make sure the directory exists
if not os.path.isdir("plots"):
    os.makedirs("plots")

# print "Plotting class distribution (histogram)"
plt.hist(trainY, bins=num_class)


# Check whether the data looks Gaussian (ignoring NODATA entries)
stats.mstats.normaltest(trainX[1, :][trainX[1, :] != NODATA])


# print "Plotting 2D lda features for data"
lda = LDA()
lda.fit(trainX, trainY)
lda_trainX = lda.transform(trainX)

pca = decomposition.PCA()
pca.fit(trainX, trainY)
pca_trainX = pca.transform(trainX)
pca_var = pca.explained_variance_ / pca.explained_variance_.sum()
# plt.plot(pca_var.cumsum())
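# Cross-check sketch: sklearn's PCA also exposes these normalised variances
# directly as explained_variance_ratio_, so the manual normalisation above can
# be verified against it if desired.
# assert np.allclose(pca_var, pca.explained_variance_ratio_)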

reduced_trainX = lda_trainX

plt.clf()
plt.scatter(reduced_trainX[:, 0], reduced_trainX[:, 1], alpha=0.3)
plt.savefig("plots/data.svg")

# Find class means (centroids) in the reduced space
class_centroids = np.zeros((num_class, reduced_trainX.shape[1]))
for i in classes:
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = reduced_trainX[class_ind, :]
    plt.clf()
    plt.scatter(classX[:, 0], classX[:, 1], alpha=1)
    plt.savefig("plots/class_" + str(int(i)) + ".svg")

    class_centroid = classX.sum(axis=0) * 1.0 / class_ind.size
    class_centroids[int(i), :] = class_centroid

plt.scatter(class_centroids[:, 0], class_centroids[:, 1], alpha=1)
plt.savefig("plots/class_avg.svg")

# Per-class scatter plots in the reduced space
for i in range(num_class):
    plt.clf()
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = reduced_trainX[class_ind, :]
    plt.scatter(classX[:, 0], classX[:, 1], alpha=1)
    plt.savefig("plots/class_" + str(i) + ".svg")




# Check which features are constant for which class
inds = []
indicator_zero = np.zeros((num_class, num_feats))
for i in classes:
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = trainX[class_ind, :]
    zero_ind, = np.where(classX.std(axis=0) < 1E-35)
    # print i, "-", zero_ind
    print i, "-", zero_ind.size
    inds += zero_ind.tolist()
    indicator_zero[int(i), zero_ind] = 1
print len(inds), "constant (class, feature) pairs in total"
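
# Quick summary sketch of the counts above: Counter tallies how many classes
# each feature is constant in, which should reproduce counts like the
# "1948: 100, 1980: 96, ..." ones quoted in the notes further down.
pprint(Counter(inds).most_common(5))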




# Split into train/test sets
###################################################################
print "Stratified K Fold"
skf = cross_validation.StratifiedShuffleSplit(
    trainY, n_iter=1, test_size=0.2)
# Currently using just 1 split; more can be used later.
skf = list(skf)

train_index, test_index = skf[0]
trainX, testX = trainX[train_index, :], trainX[test_index, :]
trainY, testY = trainY[train_index], trainY[test_index]






# Preprocessing
####################################################################

# Finding features which are constant - this won't really matter if LDA is applied
#  - 267 distinct features are constant in at least one class (2074 (class, feature) pairs in total)
#  - the number of constant features per class ranges from 4 to 60
#  - feature 1948 is constant (uninformative) in all 100 classes
#     - 1948: 100, 1980: 96, 1971: 93, 2042: 91, 1254: 74
#     - 1464 is the most frequently occurring feature (13428 data points)

# all_zero_feats, = np.where( trainX.std(axis=0) < 1e-5 )
# all_nzero_feats = np.array(list(set(range(num_feats)) - set(all_zero_feats)))
# trainX = trainX[:, all_nzero_feats]
# testX = testX[:, all_nzero_feats]
# print "Removing feats", all_zero_feats



# Box-Cox transform: (x**lambda - 1) / lambda, with lambda = 0.1
box_lambda = 0.1
NODATA = (0 ** box_lambda - 1) / box_lambda  # value that the 0 entries map to
trainX = (np.power(trainX, box_lambda) - 1) / box_lambda
testX = (np.power(testX, box_lambda) - 1) / box_lambda

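# Sketch for reference: scipy.stats.boxcox applies the same
# (x**lmbda - 1) / lmbda formula when lmbda is given, but it requires strictly
# positive inputs, so it can only be compared on the non-zero raw values here
# (the NODATA cells are exactly 0).
# pos = dataX[dataX > 0]
# assert np.allclose(stats.boxcox(pos, lmbda=box_lambda),
#                    (np.power(pos, box_lambda) - 1) / box_lambda)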

# Transform to LDA space
lda = LDA()
lda.fit(trainX, trainY)
trainX = lda.transform(trainX)
testX = lda.transform(testX)



# Percentage of exact zeros per feature within each class
percentage_zero = np.zeros((num_class, trainX.shape[1]))
for i in classes:
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = trainX[class_ind, :]
    percent_zero = (classX == 0).sum(axis=0) * 1.0 / class_ind.size
    percentage_zero[int(i), :] = percent_zero


# Recompute the shape-dependent constants after the LDA projection
classes = np.unique(trainY)
num_feats = trainX.shape[1]
num_class = classes.size






# CLUSTER
####################################################################

# KMeans - quite slow
kmeans = cluster.KMeans(n_clusters=100, max_iter=100, n_init=10, random_state=0)
kmeans.fit(trainX)
predY = kmeans.predict(testX)

# Find the distance between clusters to gauge the separability
means = kmeans.cluster_centers_
cluster_dist = np.zeros((means.shape[0], means.shape[0]))
for i in xrange(means.shape[0]):
    for j in xrange(i, means.shape[0]):
        cluster_dist[i, j] = cluster_dist[j, i] = \
            np.sqrt(np.square(means[i, :] - means[j, :]).sum())
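
# Equivalent one-call sketch: sklearn's pairwise helper computes the same
# Euclidean distance matrix between the cluster centres, without the loop.
# from sklearn.metrics.pairwise import euclidean_distances
# cluster_dist = euclidean_distances(means, means)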

# Birch - never finished; takes too long
birch = cluster.Birch(n_clusters=2)
birch.fit(trainX)
predY = birch.predict(testX)

# DBScan
# This doesn't make sense, as with dbscan we cannot train and then test, because
# the clusters don't have a definition. Also, each test/train split will need
# different values for min_samples and epsilon.
dbscan = cluster.DBSCAN(eps=1, min_samples=4)
# dbscan.fit(trainX)  # This cannot be used to predict later.
predY = dbscan.fit_predict(testX)

# Spectral - very slow, eats up crazy RAM to build the graph
# What is Spectral Co-Clustering?
spectral = cluster.SpectralClustering(n_clusters=2)
# SpectralClustering has no separate predict(); it can only label the data it is fit on
predY = spectral.fit_predict(testX)



# Compare cluster assignments against the true labels
# (assumes predY currently holds the KMeans predictions)
confusion = metrics.confusion_matrix(testY, predY)
for i in range(kmeans.n_clusters):
    print "Cluster ", i
    print confusion[:, i]
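
# Rough purity sketch for the clustering above: for each cluster, the fraction
# of its points that belong to its single most common true class (empty
# columns are guarded against to avoid dividing by zero).
# col_totals = np.maximum(confusion.sum(axis=0), 1)
# purity = confusion.max(axis=0).astype(float) / col_totals
# print "Mean cluster purity:", purity.mean()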




# Imputation
#####################################################################
imputer = preprocessing.Imputer(missing_values=NODATA, strategy='median')
imputer.fit(trainX)
trainX = imputer.transform(trainX)
testX = imputer.transform(testX)




# CLASSIFY
####################################################################
lda = LDA()
lda.fit(trainX, trainY)
predY = lda.predict(testX)


adaboost = ensemble.AdaBoostClassifier(n_estimators=50, random_state=0)
adaboost.fit(trainX, trainY)
predY = adaboost.predict(testX)


gradboost = ensemble.GradientBoostingClassifier(random_state=0)
gradboost.fit(trainX, trainY)
predY = gradboost.predict(testX)


# With min_samples_leaf set to the full sample count the tree cannot split,
# so this exports just the root node
decision_tree = tree.DecisionTreeClassifier(max_depth=1, min_samples_leaf=trainX.shape[0])
decision_tree.fit(trainX, trainY)
decision_tree.predict(testX)
with open("plots/tree.dot", "w") as f:
    tree.export_graphviz(decision_tree, out_file=f)


# GaussianNB was bad - 0.65 avg F-score, 0.60 HM F-score - so the features aren't independent
# With LDA (99 feats) it became much better - 0.80 avg and 0.77 HM
gauss_bayes = naive_bayes.GaussianNB()
gauss_bayes.fit(trainX, trainY)
predY = gauss_bayes.predict(testX)


# Ridge is surprisingly slow. Even with LDA(99) it doesn't complete in 15
# mins. It seems to take up all my RAM and I didn't wait for it to finish.
ridge = linear_model.RidgeClassifierCV()
ridge.fit(trainX, trainY)
predY = ridge.predict(testX)

# SVM results under different preprocessing:
# Avg=0.83 HM=0.81
# With LDA(99) - Avg=0.81 HM=0.79
# train all data, AM=0.91 HM=0.91
# Power transform : gives zero for most classes
# train all data + power transform, AM=HM=0.99 :oo
# Power transform + mean impute : HM=0.72 AM=0.75
# Power transform + median impute : HM=0.72 AM=0.75
# Power transform + median impute + LDA : HM=0.73 AM=0.76
svm_ = svm.SVC(C=1, kernel='rbf', gamma=0)  # gamma=0 means 1/n_features in this sklearn version
svm_.fit(trainX, trainY)
predY = svm_.predict(testX)




# Metrics: harmonic and arithmetic mean of the per-class F-scores
prfs = metrics.precision_recall_fscore_support(testY, predY, average=None)
print "HM =", stats.hmean(prfs[2]), "\t AM =", np.mean(prfs[2])
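
# Sketch of an equivalent one-liner for the arithmetic-mean figure: the
# macro-averaged F1 should match np.mean(prfs[2]), since it is the unweighted
# mean of the per-class F-scores.
# print "macro F1 =", metrics.f1_score(testY, predY, average='macro')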


print "DONE"
plt.show()


# Visualization ideas
# Find explained_variance with LDA ?
# igloo, Mondrian, iPlots

# 1. Find LDA variance explained
#  - ICA - what does it do again?
# LDA warns that "Variables are collinear"

# 3. Check chi square. f_classif ? RFE ?  (rough sketch below)
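
# Rough sketch of the univariate-selection idea from the note above. chi2
# needs non-negative inputs, so it would have to run on the raw data (dataX)
# rather than the transformed/LDA features; f_classif has no such restriction.
# These scores are only a starting point, not a tuned selector.
# chi2_scores, chi2_pvals = feature_selection.chi2(dataX, dataY)
# f_scores, f_pvals = feature_selection.f_classif(dataX, dataY)
#
# A quick way to see why LDA complains about collinearity: if the raw feature
# matrix is rank-deficient, some features are linear combinations of others.
# print "rank =", np.linalg.matrix_rank(dataX), "of", dataX.shape[1], "features"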