import os, sys
from pprint import pprint
from collections import Counter
# Change order of sys.path so that pip packages are given higher priority than apt-get
# This uses the newer sklearn version
if ('/usr/local/lib/python2.7/dist-packages' in sys.path
        and (sys.path.index('/usr/local/lib/python2.7/dist-packages') >
             sys.path.index('/usr/lib/python2.7/dist-packages'))):
    sys.path.remove('/usr/local/lib/python2.7/dist-packages')
    sys.path.insert(1, '/usr/local/lib/python2.7/dist-packages')
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import sklearn
import sklearn.cross_validation as cross_validation
from sklearn.lda import LDA
import sklearn.decomposition as decomposition
import sklearn.metrics as metrics
import sklearn.ensemble as ensemble
import sklearn.feature_selection as feature_selection
import sklearn.linear_model as linear_model
import sklearn.cluster as cluster
import sklearn.multiclass as multiclass
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.naive_bayes as naive_bayes
import sklearn.preprocessing as preprocessing
def fullprint(*args, **kwargs):
    # Temporarily disable numpy's array summarisation so the full array is printed
    opt = np.get_printoptions()
    np.set_printoptions(threshold=sys.maxsize)
    pprint(*args, **kwargs)
    np.set_printoptions(**opt)
- if os.path.exists("train.npz"):
- npzfile = np.load("train.npz")
- trainX = npzfile['trainX']
- trainY = npzfile['trainY']
- else:
- trainX = np.genfromtxt("train_X.csv", delimiter=",")
- trainY = np.genfromtxt("train_Y.csv", delimiter=",")
- np.savez("train.npz", trainX=trainX, trainY=trainY)
- # Get test data
- if os.path.exists("test.npz"):
- npzfile = np.load("test.npz")
- testX = npzfile['testX']
- else:
- testX = np.genfromtxt("test_X.csv", delimiter=",")
- np.savez("test.npz", testX=testX)
dataX = trainX.copy()
dataY = trainY.copy()
data_testX = testX.copy()
NODATA = 0
# genfromtxt returns floats; cast the labels to int so they can be used as indices
classes = np.unique(trainY).astype(int)
num_feats = trainX.shape[1]
num_class = classes.size
assert num_feats == testX.shape[1]
# Some drawings/plots to understand the data
####################################################################
# print "Plotting class distribution (histogram)"
plt.hist(trainY, bins=num_class)
# Check whether the non-missing values of a sample look gaussian
stats.mstats.normaltest(trainX[1, :][trainX[1, :] != NODATA])
- # print "Plotting 2D lda features for data"
- lda = LDA()
- lda.fit(trainX, trainY)
- lda_trainX = lda.transform(trainX)
- pca = decomposition.PCA()
- pca.fit(trainX, trainY)
- pca_trainX = pca.transform(trainX)
- pca_var = pca.explained_variance_ / pca.explained_variance_.sum()
- # plt.plot(pca_var.cumsum())
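# Note (not in the original): sklearn's PCA already exposes the normalised
# variance as pca.explained_variance_ratio_, so the manual division above
# could be replaced by:
# pca_var = pca.explained_variance_ratio_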
reduced_trainX = lda_trainX
plt.clf()
plt.scatter(reduced_trainX[:, 0], reduced_trainX[:, 1], alpha=0.3)
plt.savefig("plots/data.svg")  # NOTE: assumes a plots/ directory already exists
# Find per-class means (centroids) in the reduced space
class_centroids = np.zeros((num_class, reduced_trainX.shape[1]))
for i in classes:
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = reduced_trainX[class_ind, :]
    plt.clf()
    plt.scatter(classX[:, 0], classX[:, 1], alpha=1)
    plt.savefig("plots/class_" + str(i) + ".svg")
    class_centroid = classX.sum(axis=0) * 1.0 / class_ind.size
    class_centroids[i, :] = class_centroid
plt.scatter(class_centroids[:, 0], class_centroids[:, 1], alpha=1)
plt.savefig("plots/class_avg.svg")
for i in range(num_class):
    plt.clf()
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = reduced_trainX[class_ind, :]
    plt.scatter(classX[:, 0], classX[:, 1], alpha=1)
    plt.savefig("plots/class_" + str(i) + ".svg")
# Check which features are constant for which class
inds = []
indicator_zero = np.zeros((num_class, num_feats))
for i in classes:
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = trainX[class_ind, :]
    zero_ind, = np.where(classX.std(axis=0) < 1E-35)
    # print i, "-", zero_ind
    print i, "-", zero_ind.size
    inds += zero_ind.tolist()
    indicator_zero[i, zero_ind] = 1
print "Total constant-feature occurrences across classes:", len(inds)
# Split to test/train
###################################################################
print "Stratified K Fold"
skf = cross_validation.StratifiedShuffleSplit(
    trainY, n_iter=1, test_size=0.2)
# Currently just using a single split; more folds can be added later
# (see the cross-validation sketch below).
skf = list(skf)
train_index, test_index = skf[0]
trainX, testX = trainX[train_index, :], trainX[test_index, :]
trainY, testY = trainY[train_index], trainY[test_index]
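# Sketch (not part of the original run): the single split above could be
# extended to a full stratified K-fold loop using the same old sklearn API;
# the fold count of 5 is an arbitrary choice for illustration.
# skf_all = cross_validation.StratifiedKFold(dataY, n_folds=5)
# for tr_idx, te_idx in skf_all:
#     foldX, fold_holdX = dataX[tr_idx, :], dataX[te_idx, :]
#     foldY, fold_holdY = dataY[tr_idx], dataY[te_idx]
#     # ... fit and score a model on each fold, then average the scores ...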
# Preprocessing
####################################################################
# Finding feats which are constant - this won't really matter if LDA is done
# - 267 different features are constant in at least one class (tot 2074)
# - the number of const features ranges from 4-60 for each class
# - feature 1948 is not important for all 100 classes
# - 1948: 100, 1980: 96, 1971: 93, 2042: 91, 1254: 74
# - 1464 is the most frequently occurring feature (13428 dp)
# all_zero_feats, = np.where( trainX.std(axis=0) < 1e-5 )
# all_nzero_feats = np.array(list(set(range(num_feats)) - set(all_zero_feats)))
# trainX = trainX[:, all_nzero_feats]
# testX = testX[:, all_nzero_feats]
# print "Removing feats", all_zero_feats
# Box-Cox transform, hand-rolled with a fixed lambda (see the scipy sketch below)
box_lambda = 0.1
NODATA = (0 ** box_lambda - 1) / box_lambda
trainX = (np.power(trainX, box_lambda) - 1) / box_lambda
testX = (np.power(testX, box_lambda) - 1) / box_lambda
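# Sketch (not in the original): scipy provides a Box-Cox implementation that
# can also fit lambda by maximum likelihood. It requires strictly positive,
# 1-D input, so it would only apply per column of the raw copy (dataX) after
# shifting away the zeros.
# from scipy.stats import boxcox
# col = dataX[:, 0] + 1.0                        # strictly positive, 1-D
# transformed_col, fitted_lambda = boxcox(col)   # lambda fitted by MLE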
# Transform to LDA space
lda = LDA()
lda.fit(trainX, trainY)
trainX = lda.transform(trainX)
testX = lda.transform(testX)
# Percentage of zeros per feature within each class
percentage_zero = np.zeros((num_class, trainX.shape[1]))
for i in classes:
    class_ind, = np.where(trainY == i)
    classY = trainY[class_ind]
    classX = trainX[class_ind, :]
    percent_zero = (classX == 0).sum(axis=0) * 1.0 / class_ind.size
    percentage_zero[i, :] = percent_zero
# Recompute sizes after the LDA projection (num_feats shrinks to n_classes - 1)
classes = np.unique(trainY).astype(int)
num_feats = trainX.shape[1]
num_class = classes.size
# CLUSTER
####################################################################
# KMeans - quite slow
kmeans = cluster.KMeans(n_clusters=100, max_iter=100, n_init=10, random_state=0)
kmeans.fit(trainX)
predY = kmeans.predict(testX)
# Find the distance between cluster centres to gauge their separability
# (see the vectorised sketch below)
means = kmeans.cluster_centers_
cluster_dist = np.zeros((means.shape[0], means.shape[0]))
for i in xrange(means.shape[0]):
    for j in xrange(i, means.shape[0]):
        cluster_dist[i, j] = cluster_dist[j, i] = \
            np.sqrt(np.square(means[i, :] - means[j, :]).sum())
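# Sketch (not in the original): the pairwise centroid distances above can be
# computed in one call with scipy, avoiding the double loop.
# from scipy.spatial.distance import cdist
# cluster_dist = cdist(means, means)   # Euclidean by default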
# Birch - never completed; takes too long
birch = cluster.Birch(n_clusters=2)
birch.fit(trainX)
predY = birch.predict(testX)
# DBSCAN
# This doesn't fit the train/predict pattern: DBSCAN clusters a given set of
# points directly, so there is nothing to "train" and then apply to new data.
# Each train/test set would also need its own min_samples and eps.
dbscan = cluster.DBSCAN(eps=1, min_samples=4)
# dbscan.fit(trainX)  # This cannot be used to predict later.
predY = dbscan.fit_predict(testX)
# Spectral - very slow, eats up a huge amount of RAM to build the graph
# What is Spectral Co-Clustering?
spectral = cluster.SpectralClustering(n_clusters=2)
# spectral.fit(trainX)  # no use: SpectralClustering has no predict() for new data,
# so cluster the test set directly
predY = spectral.fit_predict(testX)
# Compare the most recent clustering's labels against the true test labels
confusion = metrics.confusion_matrix(testY, predY)
for i in range(kmeans.n_clusters):
    print "Cluster ", i
    print confusion[:, i]
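# Sketch (not in the original): a rough way to score a clustering against the
# true labels is to map each cluster to its most frequent true class and then
# measure accuracy. Assuming the true labels are 0..num_class-1, 'confusion'
# has true classes as rows and cluster ids as columns, so a column-wise argmax
# gives that mapping.
# cluster_to_class = confusion.argmax(axis=0)
# mapped_predY = cluster_to_class[predY.astype(int)]
# print "Cluster-majority accuracy:", metrics.accuracy_score(testY, mapped_predY)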
# Imputation
#####################################################################
# Replace the NODATA placeholder (the Box-Cox image of zero) with the median
imputer = preprocessing.Imputer(missing_values=NODATA, strategy='median')
imputer.fit(trainX)
trainX = imputer.transform(trainX)
testX = imputer.transform(testX)
# CLASSIFY
####################################################################
lda = LDA()
lda.fit(trainX, trainY)
predY = lda.predict(testX)
adaboost = ensemble.AdaBoostClassifier(n_estimators=50, random_state=0)
adaboost.fit(trainX, trainY)
predY = adaboost.predict(testX)
#
gradboost = ensemble.GradientBoostingClassifier(random_state=0)
gradboost.fit(trainX, trainY)
predY = gradboost.predict(testX)
decision_tree = tree.DecisionTreeClassifier(max_depth=1, min_samples_leaf=trainX.shape[0])
decision_tree.fit(trainX, trainY)
predY = decision_tree.predict(testX)
# Export the tree for inspection (see the rendering note below)
with open("plots/tree.dot", "w") as f:
    tree.export_graphviz(decision_tree, out_file=f)
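# Note (not in the original): the exported .dot file can be rendered with the
# graphviz command-line tool, e.g. `dot -Tsvg plots/tree.dot -o plots/tree.svg`.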
# GaussianNB was bad - .65 avg FS, .60 HM FS - so the features aren't independent
# With LDA (99 feats) it became much better - 0.80 avg and .77 HM
gauss_bayes = naive_bayes.GaussianNB()
gauss_bayes.fit(trainX, trainY)
predY = gauss_bayes.predict(testX)
# Ridge is surprisingly slow. Even with LDA(99) it doesn't complete in 15
# minutes. It seems to take up all my RAM and I didn't wait for it to finish.
ridge = linear_model.RidgeClassifierCV()
ridge.fit(trainX, trainY)
predY = ridge.predict(testX)
# Avg : 0.83 HM=0.81
# With LDA(99) - Avg=0.81 HM=0.79
# train all data, AM=0.91 HM=0.91
# Power transform : gives zero for most classes
# train all data + power transform, AM=HM=0.99 :oo
# Power transform + mean impute : HM=0.72 AM=0.75
# Power transform + median impute : HM=0.72 AM=0.75
# Power transform + median impute + LDA : HM=0.73 AM=0.76
# gamma=0 means 1/n_features in this sklearn version
svm_ = svm.SVC(C=1, kernel='rbf', gamma=0)
svm_.fit(trainX, trainY)
predY = svm_.predict(testX)
# Metrics
prfs = metrics.precision_recall_fscore_support(testY, predY, average=None)
print "HM =", stats.hmean(prfs[2]), "\t AM =", np.mean(prfs[2])
- print "DONE"
- plt.show()
# Visualization
# Find explained_variance with LDA ?
# igloo, Mondrian, iPlots
# 1. Find LDA variance explained
# - ICA - what does it do again?
# LDA warns that variables are collinear
# 3. Check chi square. f_classif ? RFE ?