Advertisement
Guest User

Untitled

a guest
Nov 20th, 2018
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.26 KB | None | 0 0
  1. '''
  2. Author: Kalina Jasinska
  3. '''
  4. import itertools
  5. import matplotlib.pyplot as plt
  6.  
  7. from classifiers_students.naive_bays import NaiveBayes
  8. from plot_learning_curve import evaluate_accuracy_and_time, evaluate_partial_accuracy_and_time
  9. from sklearn.naive_bayes import GaussianNB
  10. from sklearn.neighbors import KNeighborsClassifier
  11. from sklearn.linear_model import LogisticRegression
  12. from utils.evaluate import scorer_squared_error, scorer_01loss
  13. from utils.load import read_datasets
  14. import time
  15. import numpy as np
  16. # Implement plotting of a learning curve using sklearn
  17. # Remember that the evaluation metrics to plot are 0/1 loss and squared error
  18.  
  19.  
# (train-csv-path, test-csv-path, display-name) triples for the datasets
# this script can evaluate; evaluate_classifer() picks one by index.
datasets = [('../data/badges2-train.csv', '../data/badges2-test.csv',  "Badges2"),
            ('../data/credit-a-train.csv','../data/credit-a-test.csv', "Credit-a"),
            ('../data/credit-a-mod-train.csv','../data/credit-a-mod-test.csv', "Credit-a-mod"),
            ('../data/spambase-train.csv', '../data/spambase-test.csv', "Spambase"),
            ('../data/covtype-train.csv', '../data/covtype-test.csv', "Covtype")
           ]
  26.  
  27.  
def make_learning_curves():
    """Plot a learning curve using sklearn (0/1 loss and squared error).

    Not implemented yet; see the module-level TODO comment above.
    """
    raise NotImplementedError
  30.  
  31.  
  32. def evaluate_classifer():
  33.     number_of_subitems = 20
  34.  
  35.     fn, fn_test, ds_name = datasets[2]
  36.     # fn, fn_test, ds_name = '../data/spambase-train.csv', '../data/spambase-test.csv', "Spambase"
  37.     # fn, fn_test, ds_name = '../data/credit-a-mod-train.csv', '../data/credit-a-mod-test.csv', "Credit-a-mod"
  38.     print("Dataset {0}".format(ds_name))
  39.     X_train, y_train, X_test, y_test, is_categorical = read_datasets(fn, fn_test)
  40.     classifier_regression = LogisticRegression()
  41.     classifier_bayes = GaussianNB()
  42.  
  43.     splitted_train_x = np.array_split(X_train, number_of_subitems)
  44.     splitted_train_y = np.array_split(y_train, number_of_subitems)
  45.  
  46.     splitted_train_indices = [i for i in range(0, number_of_subitems)]
  47.  
  48.     all_train_times_regression = []
  49.     all_test_times_regression = []
  50.     all_losses_regression = [[],[],[],[]]
  51.  
  52.     all_train_times_bayes = []
  53.     all_test_times_bayes = []
  54.     all_losses_bayes = [[], [], [], []]
  55.     for i in range(0, number_of_subitems):
  56.         total_test_time_regression = 0
  57.         total_train_time_regression = 0
  58.         total_loss_regression = [0, 0, 0, 0]
  59.  
  60.         total_test_time_bayes = 0
  61.         total_train_time_bayes = 0
  62.         total_loss_bayes = [0, 0, 0, 0]
  63.  
  64.         combinations = list(itertools.combinations(splitted_train_indices, i+1))
  65.         reduced_combinations = min(len(combinations), 10)
  66.         combinations = [combinations[i] for i in range(0, reduced_combinations)]
  67.         print len(combinations)
  68.         for key, combination in enumerate(combinations):
  69.             test_matrix_x = np.asarray([np.asarray([])])
  70.             test_matrix_y = np.asarray([np.asarray([])])
  71.             for key2, item in enumerate(combination):
  72.                 splitted_train_item_x = np.array(splitted_train_x[item])
  73.                 splitted_train_item_y = np.array(splitted_train_y[item])
  74.                 if test_matrix_x.size == 0:
  75.                     test_matrix_x = splitted_train_item_x
  76.                 else:
  77.                     test_matrix_x = np.concatenate((test_matrix_x, splitted_train_item_x))
  78.                 if test_matrix_y.size == 0:
  79.                     test_matrix_y = splitted_train_item_y
  80.                 else:
  81.                     test_matrix_y = np.concatenate((test_matrix_y, splitted_train_item_y))
  82.  
  83.             single_train_time_regression, single_test_time_regression, single_test_01_loss_regression, single_test_square_loss_regression = evaluate_partial_accuracy_and_time(classifier_regression, test_matrix_x, test_matrix_y, X_test, y_test)
  84.             single_train_time_bayes, single_test_time_bayes, single_test_01_loss_bayes, single_test_square_loss_bayes = evaluate_partial_accuracy_and_time(classifier_bayes, test_matrix_x, test_matrix_y, X_test, y_test)
  85.  
  86.             total_test_time_regression += single_test_time_regression
  87.             total_train_time_regression += single_train_time_regression
  88.  
  89.             total_test_time_bayes += single_test_time_bayes
  90.             total_train_time_bayes +=single_train_time_bayes
  91.  
  92.             total_loss_regression[0] += single_test_01_loss_regression[0]
  93.             total_loss_regression[1] += single_test_square_loss_regression[0]
  94.             total_loss_regression[2] += single_test_01_loss_regression[1]
  95.             total_loss_regression[3] += single_test_square_loss_regression[1]
  96.  
  97.             total_loss_bayes[0] += single_test_01_loss_bayes[0]
  98.             total_loss_bayes[1] += single_test_square_loss_bayes[0]
  99.             total_loss_bayes[2] += single_test_01_loss_bayes[1]
  100.             total_loss_bayes[3] += single_test_square_loss_bayes[1]
  101.  
  102.         avg_train_time_regression = total_train_time_regression / len(combinations)
  103.         avg_test_time_regression = total_test_time_regression / len(combinations)
  104.         avg_train_time_bayes = total_train_time_bayes / len(combinations)
  105.         avg_test_time_bayes = total_test_time_bayes / len(combinations)
  106.  
  107.         #train_01_loss, train_square_loss, test_01_loss, test_square_loss
  108.         avg_loss_regression = [total_loss_regression[0], total_loss_regression[1], total_loss_regression[2], total_loss_regression[3]]
  109.         avg_loss_bayes = [total_loss_bayes[0], total_loss_bayes[1], total_loss_bayes[2], total_loss_bayes[3]]
  110.         avg_loss_regression = [i/len(combinations) for i in avg_loss_regression]
  111.         avg_loss_bayes = [i/len(combinations) for i in avg_loss_bayes]
  112.  
  113.         all_train_times_regression.append(avg_train_time_regression)
  114.         all_test_times_regression.append(avg_test_time_regression)
  115.  
  116.         all_losses_regression[0].append(avg_loss_regression[0])
  117.         all_losses_regression[1].append(avg_loss_regression[1])
  118.         all_losses_regression[2].append(avg_loss_regression[2])
  119.         all_losses_regression[3].append(avg_loss_regression[3])
  120.  
  121.         all_train_times_bayes.append(avg_train_time_bayes)
  122.         all_test_times_bayes.append(avg_test_time_bayes)
  123.  
  124.         all_losses_bayes[0].append(avg_loss_bayes[0])
  125.         all_losses_bayes[1].append(avg_loss_bayes[1])
  126.         all_losses_bayes[2].append(avg_loss_bayes[2])
  127.         all_losses_bayes[3].append(avg_loss_bayes[3])
  128.  
  129.      #Create plot from all times
  130.     plt.figure(1)
  131.     plt.subplots_adjust(hspace=0.5)
  132.  
  133.     plt.subplot(221)
  134.     plt.ylabel('Time')
  135.     plt.title('Logistic Regression Loss')
  136.     plt.xlabel('Number of 5% subsets')
  137.     plt.plot(range(0, 20), all_losses_regression[0], 'b', label='0/1 train loss reg')
  138.     plt.plot(range(0, 20), all_losses_regression[1], 'r', label='0/1 test loss reg')
  139.     plt.legend(loc='upper right')
  140.  
  141.     plt.subplot(222)
  142.     plt.ylabel('Time')
  143.     plt.title('Logistic Regression Times')
  144.     plt.xlabel('Number of 5% subsets')
  145.     plt.plot(range(0, 20), all_train_times_regression, 'b', label='train time regression')
  146.     plt.plot(range(0, 20), all_test_times_regression, 'r', label='test time regression')
  147.     plt.legend(loc='upper right')
  148.  
  149.     plt.subplot(223)
  150.     plt.ylabel('Time')
  151.     plt.title('Bays Loss')
  152.     plt.xlabel('Number of 5% subsets')
  153.     plt.plot(range(0, 20), all_losses_bayes[0], 'b', label='0/1 train loss bay')
  154.     plt.plot(range(0, 20), all_losses_bayes[1], 'r', label='0/1 test loss bay')
  155.     plt.legend(loc='upper right')
  156.  
  157.     plt.subplot(224)
  158.     plt.ylabel('Time')
  159.     plt.title('Bays Times')
  160.     plt.xlabel('Number of 5% subsets')
  161.     plt.plot(range(0, 20), all_train_times_bayes, 'b', label='train time bayes')
  162.     plt.plot(range(0, 20), all_test_times_bayes, 'r', label='test time bayes')
  163.     plt.legend(loc='upper right')
  164.  
  165.     plt.show()
  166.  
# Script entry point: run the subset-based evaluation of the two classifiers.
if __name__ == "__main__":
    evaluate_classifer()
    #make_learning_curves()
  170.  
  171.  
  172.  
  173.  
  174.  
  175.  
  176. import numpy as np
  177. import matplotlib.pyplot as plt
  178. from utils.load import convert_to_onehot
  179. from utils.evaluate import scorer_squared_error, scorer_01loss
  180. from utils.load import read_and_convert_pandas_files
  181. import time
  182.  
  183. def evaluate_accuracy_and_time(classifier, X_train, y_train, X_test, y_test):
  184.     start_time = time.time()
  185.     classifier.fit(X_train, y_train)
  186.     training_time = time.time() - start_time
  187.     print("Training time = {0}".format(training_time))
  188.  
  189.     scorers = [(scorer_01loss, "0/1 loss"), (scorer_squared_error, "squared error")]
  190.     start_time = time.time()
  191.     for scorer, scorer_name in scorers:
  192.         print("Train {0} = {1}".format(scorer_name, scorer(classifier, X_train, y_train)))
  193.         print("Test {0} = {1}".format(scorer_name, scorer(classifier, X_test, y_test)))
  194.     testing_time = time.time() - start_time
  195.     print("Testing time = {0}".format(testing_time))
  196.     print ("--------------------------------------")
  197.  
  198. def evaluate_partial_accuracy_and_time(classifier, X_train, y_train, X_test, y_test):
  199.     start_time = time.time()
  200.     classifier.fit(X_train, y_train)
  201.     training_time = time.time() - start_time
  202.     train_loss = []
  203.     test_loss = []
  204.  
  205.     scorers = [(scorer_01loss, "0/1 loss"), (scorer_squared_error, "squared error")]
  206.     start_time = time.time()
  207.     for scorer, scorer_name in scorers:
  208.         train_loss.append(scorer(classifier, X_train, y_train))
  209.         test_loss.append(scorer(classifier, X_test, y_test))
  210.     testing_time = time.time() - start_time
  211.     return (training_time, testing_time, train_loss, test_loss)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement