Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
"""Compare k-NN classification error on the spambase data set.

For k = 1..40 neighbours, estimates the mean misclassification rate with
5-fold cross-validation on three feature sets:
  * all 57 features,
  * the 20 best features by the chi2 score,
  * the 40 best features by the chi2 score,
and plots the three error curves against k.
"""
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches  # kept: may be used elsewhere in the file
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# this script targets the old API (sklearn.model_selection is the successor).
import sklearn.cross_validation
import sklearn.feature_selection
import sklearn.neighbors

# Make the course toolbox importable from the sibling directory.
lib_path = os.path.join(os.path.dirname(__file__), "02450Toolbox_Python", "Tools")
sys.path.append(lib_path)
import toolbox_02450

MAX_K = 40    # largest neighbour count evaluated
N_FOLDS = 5   # outer cross-validation folds


def _knn_error_rate(n_neighbors, X_train, y_train, X_test, y_test):
    """Fit a Euclidean k-NN classifier and return the test-fold error rate.

    Parameters are the neighbour count and the train/test split; returns the
    fraction of misclassified test samples as a float in [0, 1].
    """
    clf = sklearn.neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbors, metric="euclidean")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return np.sum(y_pred.ravel() != y_test.ravel(), dtype=float) / y_test.shape[0]


def _chi2_subset(n_features, X_train, y_train, X_test):
    """Select the n_features best columns by chi2 (fitted on the train fold only)."""
    kbest = sklearn.feature_selection.SelectKBest(
        sklearn.feature_selection.chi2, n_features)
    kbest.fit(X_train, y_train)
    return kbest.transform(X_train), kbest.transform(X_test)


# Column 57 is the spam/no-spam label; columns 0..56 are the features.
data = np.genfromtxt(os.path.join(os.path.dirname(__file__), "spam.data"))
y = data[:, 57].ravel()
X = data[:, :57]
class_names = ["No Spam", "Spam"]
N = len(X)

kf = sklearn.cross_validation.KFold(N, N_FOLDS, shuffle=True, random_state=0)

# Mean error rate per neighbour count, index i holds k = i + 1.
errors_all = np.zeros(MAX_K)
errors_best20 = np.zeros(MAX_K)
errors_best40 = np.zeros(MAX_K)

for i in range(MAX_K):
    k = i + 1
    # BUG FIX: accumulators must be reset once per k, *before* the fold loop
    # (the original reset them per fold, so only the last fold was averaged).
    sum_all = 0.0
    sum_best20 = 0.0
    sum_best40 = 0.0
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # All 57 features.
        sum_all += _knn_error_rate(k, X_train, y_train, X_test, y_test)

        # 20 best features by chi2.
        X_tr20, X_te20 = _chi2_subset(20, X_train, y_train, X_test)
        sum_best20 += _knn_error_rate(k, X_tr20, y_train, X_te20, y_test)

        # 40 best features by chi2.
        X_tr40, X_te40 = _chi2_subset(40, X_train, y_train, X_test)
        sum_best40 += _knn_error_rate(k, X_tr40, y_train, X_te40, y_test)

    errors_all[i] = sum_all / len(kf)
    errors_best20[i] = sum_best20 / len(kf)
    errors_best40[i] = sum_best40 / len(kf)

plt.xlim(1, MAX_K)
ks = range(1, MAX_K + 1)
plt.plot(ks, errors_all * 100, color="blue", label="All features")
plt.plot(ks, errors_best20 * 100, color="red", label="20 best, chi2")
plt.plot(ks, errors_best40 * 100, color="green", label="40 best, chi2")
plt.legend()
plt.xlabel("k neighbors")
plt.ylabel("mean classification error rate %")
plt.title("K-Neighbors (euclidean)")
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement