#!/usr/bin/env python
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from common import describe_data, test_env


def read_data(file):
    """Return pandas DataFrame read from Excel file"""
    try:
        return pd.read_excel(file)
    except FileNotFoundError:
        exit('ERROR: ' + file + ' not found')


def preprocess_data(df, verbose=False):
    y_column = 'In university after 4 semesters'

    # Features can be excluded by adding their column name to this list
    drop_columns = []

    categorical_columns = [
        'Faculty',
        'Paid tuition',
        'Study load',
        'Previous school level',
        'Previous school study language',
        'Recognition',
        'Study language',
        'Foreign student'
    ]

    # Handle dependent variable
    if verbose:
        print('Missing y values: ', df[y_column].isna().sum())

    y = df[y_column].values
    # Encode y. Naive solution: map 'No' -> 0 and 'Yes' -> 1
    y = np.where(y == 'No', 0, y)
    y = np.where(y == 'Yes', 1, y)
    y = y.astype(float)
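    # A more general alternative (a sketch, not used here), assuming there
    # are no missing y values: sklearn.preprocessing.LabelEncoder maps the
    # sorted labels 'No'/'Yes' to 0/1 in the same way:
    #   from sklearn.preprocessing import LabelEncoder
    #   y = LabelEncoder().fit_transform(df[y_column].values)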

    # Also drop the dependent variable column to leave only features
    drop_columns.append(y_column)
    df = df.drop(labels=drop_columns, axis=1)

    # Remove dropped columns from categorical columns just in case
    categorical_columns = [
        i for i in categorical_columns if i not in drop_columns]

    # STUDENT SHALL ENCODE CATEGORICAL FEATURES
    # fillna replaces NA/NaN values in the categorical columns with 'Missing'
    for i in categorical_columns:
        df[i] = df[i].fillna(value='Missing')

    # One-hot encode categorical features and drop one reference category
    # per encoded feature to avoid redundant columns
    dum = pd.get_dummies(df, prefix_sep=":", columns=categorical_columns)
    df = dum.drop(
        columns=[
            'Paid tuition:No',
            'Foreign student:No',
            'Previous school study language:Not known',
            'Study load:Partial',
            'Faculty:School of Engineering',
            'Recognition:Missing',
            'Study language:Estonian'])

    # Handle missing data. At this point only exam points should be missing.
    # It is easier to fill the whole data frame than only particular columns.
    if verbose:
        describe_data.print_nan_counts(df)

    # STUDENT SHALL HANDLE MISSING VALUES
    # Fill the remaining NaN values (missing exam points) with 0
    df = df.fillna(value=0)
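    # Alternative (a sketch, not used here): impute missing exam points with
    # the column mean instead of 0, e.g.
    #   df = df.fillna(df.mean(numeric_only=True))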

    if verbose:
        describe_data.print_nan_counts(df)

    # Return features data frame and dependent variable
    return df, y


# STUDENT SHALL CREATE FUNCTIONS FOR LOGISTIC REGRESSION CLASSIFIER, KNN
# CLASSIFIER, SVM CLASSIFIER, NAIVE BAYES CLASSIFIER, DECISION TREE
# CLASSIFIER AND RANDOM FOREST CLASSIFIER
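# Each function below follows the same pattern: fit the model on the
# training set, predict on the test set and report a confusion matrix
# with accuracy via print_confusion_matrix().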

def random_forest_classifier(train_X, train_y, test_X, test_y):
    rnd_clf = RandomForestClassifier(n_estimators=100)
    rnd_clf.fit(train_X, train_y)
    pred_y = rnd_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Random forest classifier test data")
    print_confusion_matrix(matrix, accuracy)


def decision_tree_classifier(train_X, train_y, test_X, test_y):
    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(train_X, train_y)
    pred_y = tree_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Decision tree classifier test data")
    print_confusion_matrix(matrix, accuracy)


def naive_bayes_classifier(train_X, train_y, test_X, test_y):
    # MultinomialNB requires non-negative feature values; the one-hot
    # encoded dummies satisfy this (and exam points are assumed non-negative)
    naive_clf = MultinomialNB()
    naive_clf.fit(train_X, train_y)
    pred_y = naive_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Naive Bayes test data")
    print_confusion_matrix(matrix, accuracy)


def svm_classifier(train_X, train_y, test_X, test_y):
    # probability=True is not needed for predict(), but is kept to preserve
    # the original behaviour
    svm_clf = SVC(kernel='sigmoid', random_state=0,
                  gamma=.01, C=1, probability=True)
    svm_clf.fit(train_X, train_y)
    pred_y = svm_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# SVM classifier test data")
    print_confusion_matrix(matrix, accuracy)


def print_confusion_matrix(matrix, accuracy):
    print("\nConfusion matrix: \n     pred:No pred:Yes")

    print("true:No   " + str(matrix[0][0]) + "  " + str(matrix[0][1]))
    print("true:Yes  " + str(matrix[1][0]) + "  " + str(matrix[1][1]))

    print("\nAccuracy: " + str(accuracy * 100) + "%")


def logistic_regression_classifier(train_X, train_y, test_X, test_y):
    logreg_clf = LogisticRegression(
        solver='newton-cg',
        multi_class='multinomial')
    logreg_clf.fit(train_X, train_y)

    pred_y = logreg_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Logistic regression test data")
    print_confusion_matrix(matrix, accuracy)


def knn_classifier(train_X, train_y, test_X, test_y):
    knn_clf = KNeighborsClassifier(n_neighbors=5)
    knn_clf.fit(train_X, train_y)
    pred_y = knn_clf.predict(test_X)

    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# KNN test data")
    print_confusion_matrix(matrix, accuracy)


if __name__ == '__main__':
    modules = ['numpy', 'pandas', 'sklearn']
    test_env.versions(modules)

    students = read_data('data/students.xlsx')

    # STUDENT SHALL CALL PRINT_OVERVIEW AND PRINT_CATEGORICAL FUNCTIONS WITH
    # FILE NAME AS ARGUMENT
    describe_data.print_overview(
        students, file='results/students_overview.txt')
    describe_data.print_categorical(
        students, file='results/students_categorical_data.txt')

    students_X, students_y = preprocess_data(students)

    # Split the data for the classifiers. With test_size=None and
    # train_size=None, sklearn's default test size of 0.25 is used.
    train_X, test_X, train_y, test_y = train_test_split(
        students_X, students_y, test_size=None, train_size=None,
        random_state=60)
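    # Optional tweak (not in the original): passing stratify=students_y to
    # train_test_split above would keep the class balance equal in the
    # train and test splits.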

    # STUDENT SHALL CALL CREATED CLASSIFIER FUNCTIONS
    logistic_regression_classifier(train_X, train_y, test_X, test_y)
    knn_classifier(train_X, train_y, test_X, test_y)
    svm_classifier(train_X, train_y, test_X, test_y)
    naive_bayes_classifier(train_X, train_y, test_X, test_y)
    decision_tree_classifier(train_X, train_y, test_X, test_y)

    print('Done')