'''
Author: Kalina Jasinska
'''
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from classifiers_students.naive_bays import NaiveBayes
from plot_learning_curve import evaluate_accuracy_and_time, evaluate_partial_accuracy_and_time
from utils.evaluate import scorer_squared_error, scorer_01loss
from utils.load import read_datasets

# Implement plotting of a learning curve using sklearn.
# Remember that the evaluation metrics to plot are 0/1 loss and squared error.
datasets = [('../data/badges2-train.csv', '../data/badges2-test.csv', "Badges2"),
            ('../data/credit-a-train.csv', '../data/credit-a-test.csv', "Credit-a"),
            ('../data/credit-a-mod-train.csv', '../data/credit-a-mod-test.csv', "Credit-a-mod"),
            ('../data/spambase-train.csv', '../data/spambase-test.csv', "Spambase"),
            ('../data/covtype-train.csv', '../data/covtype-test.csv', "Covtype")]

def make_learning_curves():
    raise NotImplementedError
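
# One possible way to fill in make_learning_curves, sketched here with
# sklearn.model_selection.learning_curve rather than the manual subset loop in
# evaluate_classifier below. The function name and parameters are illustrative,
# not part of the original assignment code; it assumes scorer_01loss follows
# the sklearn scorer(estimator, X, y) protocol, as its use in
# plot_learning_curve.py suggests.
def make_learning_curves_sklearn(classifier, X_train, y_train):
    from sklearn.model_selection import learning_curve
    # Evaluate at 20 training-set fractions (5%, 10%, ..., 100%) with 5-fold CV.
    train_sizes, train_scores, cv_scores = learning_curve(
        classifier, X_train, y_train,
        train_sizes=np.linspace(0.05, 1.0, 20), cv=5, scoring=scorer_01loss)
    # Average the per-fold losses and plot both curves against training size.
    plt.plot(train_sizes, train_scores.mean(axis=1), 'b', label='0/1 train loss')
    plt.plot(train_sizes, cv_scores.mean(axis=1), 'r', label='0/1 cv loss')
    plt.xlabel('Training set size')
    plt.ylabel('Loss')
    plt.legend(loc='upper right')
    plt.show()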

def evaluate_classifier():
    number_of_subitems = 20
    fn, fn_test, ds_name = datasets[2]
    # fn, fn_test, ds_name = '../data/spambase-train.csv', '../data/spambase-test.csv', "Spambase"
    # fn, fn_test, ds_name = '../data/credit-a-mod-train.csv', '../data/credit-a-mod-test.csv', "Credit-a-mod"
    print("Dataset {0}".format(ds_name))
    X_train, y_train, X_test, y_test, is_categorical = read_datasets(fn, fn_test)
    classifier_regression = LogisticRegression()
    classifier_bayes = GaussianNB()
    # Split the training set into 20 chunks of roughly 5% each.
    split_train_x = np.array_split(X_train, number_of_subitems)
    split_train_y = np.array_split(y_train, number_of_subitems)
    split_train_indices = list(range(number_of_subitems))
    all_train_times_regression = []
    all_test_times_regression = []
    all_losses_regression = [[], [], [], []]
    all_train_times_bayes = []
    all_test_times_bayes = []
    all_losses_bayes = [[], [], [], []]
    for i in range(number_of_subitems):
        total_test_time_regression = 0
        total_train_time_regression = 0
        total_loss_regression = [0, 0, 0, 0]
        total_test_time_bayes = 0
        total_train_time_bayes = 0
        total_loss_bayes = [0, 0, 0, 0]
        # All ways of picking i+1 of the 20 chunks, capped at the first 10
        # combinations to keep the runtime manageable.
        combinations = list(itertools.combinations(split_train_indices, i + 1))
        reduced_combinations = min(len(combinations), 10)
        combinations = combinations[:reduced_combinations]
        print(len(combinations))
        for combination in combinations:
            # Stack the selected chunks into one partial training set.
            partial_train_x = np.concatenate([split_train_x[item] for item in combination])
            partial_train_y = np.concatenate([split_train_y[item] for item in combination])
            (single_train_time_regression, single_test_time_regression,
             single_train_loss_regression,
             single_test_loss_regression) = evaluate_partial_accuracy_and_time(
                classifier_regression, partial_train_x, partial_train_y, X_test, y_test)
            (single_train_time_bayes, single_test_time_bayes,
             single_train_loss_bayes,
             single_test_loss_bayes) = evaluate_partial_accuracy_and_time(
                classifier_bayes, partial_train_x, partial_train_y, X_test, y_test)
            total_test_time_regression += single_test_time_regression
            total_train_time_regression += single_train_time_regression
            total_test_time_bayes += single_test_time_bayes
            total_train_time_bayes += single_train_time_bayes
            # Each loss is a [0/1 loss, squared error] pair; accumulate in the
            # order [train 0/1, test 0/1, train squared, test squared].
            total_loss_regression[0] += single_train_loss_regression[0]
            total_loss_regression[1] += single_test_loss_regression[0]
            total_loss_regression[2] += single_train_loss_regression[1]
            total_loss_regression[3] += single_test_loss_regression[1]
            total_loss_bayes[0] += single_train_loss_bayes[0]
            total_loss_bayes[1] += single_test_loss_bayes[0]
            total_loss_bayes[2] += single_train_loss_bayes[1]
            total_loss_bayes[3] += single_test_loss_bayes[1]
        avg_train_time_regression = total_train_time_regression / len(combinations)
        avg_test_time_regression = total_test_time_regression / len(combinations)
        avg_train_time_bayes = total_train_time_bayes / len(combinations)
        avg_test_time_bayes = total_test_time_bayes / len(combinations)
        # Order: train 0/1 loss, test 0/1 loss, train squared error, test squared error
        avg_loss_regression = [loss / len(combinations) for loss in total_loss_regression]
        avg_loss_bayes = [loss / len(combinations) for loss in total_loss_bayes]
        all_train_times_regression.append(avg_train_time_regression)
        all_test_times_regression.append(avg_test_time_regression)
        all_train_times_bayes.append(avg_train_time_bayes)
        all_test_times_bayes.append(avg_test_time_bayes)
        for idx in range(4):
            all_losses_regression[idx].append(avg_loss_regression[idx])
            all_losses_bayes[idx].append(avg_loss_bayes[idx])
    # Plot the averaged losses and times for both classifiers.
    subset_counts = range(1, number_of_subitems + 1)
    plt.figure(1)
    plt.subplots_adjust(hspace=0.5)
    plt.subplot(221)
    plt.title('Logistic Regression Loss')
    plt.xlabel('Number of 5% subsets')
    plt.ylabel('Loss')
    plt.plot(subset_counts, all_losses_regression[0], 'b', label='0/1 train loss reg')
    plt.plot(subset_counts, all_losses_regression[1], 'r', label='0/1 test loss reg')
    plt.legend(loc='upper right')
    plt.subplot(222)
    plt.title('Logistic Regression Times')
    plt.xlabel('Number of 5% subsets')
    plt.ylabel('Time')
    plt.plot(subset_counts, all_train_times_regression, 'b', label='train time regression')
    plt.plot(subset_counts, all_test_times_regression, 'r', label='test time regression')
    plt.legend(loc='upper right')
    plt.subplot(223)
    plt.title('Naive Bayes Loss')
    plt.xlabel('Number of 5% subsets')
    plt.ylabel('Loss')
    plt.plot(subset_counts, all_losses_bayes[0], 'b', label='0/1 train loss bayes')
    plt.plot(subset_counts, all_losses_bayes[1], 'r', label='0/1 test loss bayes')
    plt.legend(loc='upper right')
    plt.subplot(224)
    plt.title('Naive Bayes Times')
    plt.xlabel('Number of 5% subsets')
    plt.ylabel('Time')
    plt.plot(subset_counts, all_train_times_bayes, 'b', label='train time bayes')
    plt.plot(subset_counts, all_test_times_bayes, 'r', label='test time bayes')
    plt.legend(loc='upper right')
    plt.show()
- if __name__ == "__main__":
- evaluate_classifer()
- #make_learning_curves()

# plot_learning_curve.py -- the helper module imported by the script above.
import time

import matplotlib.pyplot as plt
import numpy as np

from utils.evaluate import scorer_squared_error, scorer_01loss
from utils.load import convert_to_onehot, read_and_convert_pandas_files

def evaluate_accuracy_and_time(classifier, X_train, y_train, X_test, y_test):
    # Fit the classifier and time the training phase.
    start_time = time.time()
    classifier.fit(X_train, y_train)
    training_time = time.time() - start_time
    print("Training time = {0}".format(training_time))
    scorers = [(scorer_01loss, "0/1 loss"), (scorer_squared_error, "squared error")]
    # Score both splits with each metric and time the evaluation phase.
    start_time = time.time()
    for scorer, scorer_name in scorers:
        print("Train {0} = {1}".format(scorer_name, scorer(classifier, X_train, y_train)))
        print("Test {0} = {1}".format(scorer_name, scorer(classifier, X_test, y_test)))
    testing_time = time.time() - start_time
    print("Testing time = {0}".format(testing_time))
    print("--------------------------------------")

def evaluate_partial_accuracy_and_time(classifier, X_train, y_train, X_test, y_test):
    # Like evaluate_accuracy_and_time, but returns the measurements instead of
    # printing them: (training_time, testing_time,
    # [train 0/1 loss, train squared error], [test 0/1 loss, test squared error]).
    start_time = time.time()
    classifier.fit(X_train, y_train)
    training_time = time.time() - start_time
    train_loss = []
    test_loss = []
    scorers = [(scorer_01loss, "0/1 loss"), (scorer_squared_error, "squared error")]
    start_time = time.time()
    for scorer, scorer_name in scorers:
        train_loss.append(scorer(classifier, X_train, y_train))
        test_loss.append(scorer(classifier, X_test, y_test))
    testing_time = time.time() - start_time
    return (training_time, testing_time, train_loss, test_loss)
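
# A hedged usage sketch for this module (the classifier, loader, and dataset
# paths are borrowed from the accompanying script; the paths are examples):
#
#     from sklearn.naive_bayes import GaussianNB
#     from utils.load import read_datasets
#     X_train, y_train, X_test, y_test, _ = read_datasets(
#         '../data/spambase-train.csv', '../data/spambase-test.csv')
#     evaluate_accuracy_and_time(GaussianNB(), X_train, y_train, X_test, y_test)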