Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import numpy as np
- import pandas as pd
- from common import describe_data, test_env
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.metrics import confusion_matrix
- from sklearn.linear_model import LogisticRegression
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.svm import SVC
- from sklearn.naive_bayes import MultinomialNB
- from sklearn.ensemble import RandomForestClassifier
def read_data(file):
    """Return a pandas DataFrame read from an Excel (.xlsx) file.

    Terminates the program with an error message if the file does not
    exist.  (The previous docstring claimed CSV; the function has always
    used ``pd.read_excel``.)
    """
    try:
        return pd.read_excel(file)
    except FileNotFoundError:
        # raise SystemExit directly instead of the site-provided exit()
        # builtin, which is not guaranteed to exist in every interpreter
        raise SystemExit(f'ERROR: {file} not found')
def preprocess_data(df, verbose=False):
    """Turn the raw students DataFrame into a (features, target) pair.

    Parameters:
        df: DataFrame read from the students Excel file; must contain the
            'In university after 4 semesters' column and the categorical
            columns listed below.
        verbose: when True, print NaN diagnostics for y and the features.

    Returns:
        (df, y): one-hot-encoded feature DataFrame and a float numpy array
        where 1.0 means the student was still enrolled after 4 semesters
        and 0.0 means they were not.
    """
    y_column = 'In university after 4 semesters'
    # Features can be excluded from the model by adding their column name
    # to this list
    drop_columns = []
    categorical_columns = [
        'Faculty',
        'Paid tuition',
        'Study load',
        'Previous school level',
        'Previous school study language',
        'Recognition',
        'Study language',
        'Foreign student'
    ]

    # Handle dependent variable
    if verbose:
        print('Missing y values: ', df[y_column].isna().sum())
    y = df[y_column].values
    # Encode y. Naive solution: 'No' -> 0, 'Yes' -> 1; any other value
    # passes through np.where unchanged before the float cast
    y = np.where(y == 'No', 0, y)
    y = np.where(y == 'Yes', 1, y)
    y = y.astype(float)

    # Drop the dependent-variable column as well, leaving only features
    drop_columns.append(y_column)
    df = df.drop(labels=drop_columns, axis=1)

    # Remove dropped columns from the categorical list just in case
    categorical_columns = [
        i for i in categorical_columns if i not in drop_columns]

    # Encode categorical features: make missing categories an explicit
    # 'Missing' level so get_dummies produces a column for them
    for i in categorical_columns:
        df[i] = df[i].fillna(value='Missing')
    dum = pd.get_dummies(df, prefix_sep=":", columns=categorical_columns)
    # Drop one reference level per category to avoid the dummy-variable
    # trap; these names are specific to the students dataset
    df = dum.drop(
        columns=[
            'Paid tuition:No',
            'Foreign student:No',
            'Previous school study language:Not known',
            'Study load:Partial',
            'Faculty:School of Engineering',
            'Recognition:Missing',
            'Study language:Estonian'])
    # Handle missing data. At this point only exam points should be
    # missing, and filling the whole frame with 0 is easier than
    # targeting the particular columns
    df = df.fillna(value=0)
    if verbose:
        describe_data.print_nan_counts(df)

    if verbose:
        describe_data.print_nan_counts(df)

    # Return features data frame and dependent variable
    return df, y
- # STUDENT SHALL CREATE FUNCTIONS FOR LOGISTIC REGRESSION CLASSIFIER, KNN
- # CLASSIFIER, SVM CLASSIFIER, NAIVE BAYES CLASSIFIER, DECISION TREE
- # CLASSIFIER AND RANDOM FOREST CLASSIFIER
def random_forest_classifier(train_X, train_y, test_X, test_y):
    """Train a random forest and print its confusion matrix and accuracy
    on the test split."""
    # random_state pinned for reproducible runs, consistent with the
    # seeded svm_classifier; previously every run gave different results
    rnd_clf = RandomForestClassifier(n_estimators=100, random_state=0)
    rnd_clf.fit(train_X, train_y)
    pred_y = rnd_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Random forest classifier test data")
    print_confusion_matrix(matrix, accuracy)
def decision_tree_classifier(train_X, train_y, test_X, test_y):
    """Train a decision tree and print its confusion matrix and accuracy
    on the test split."""
    # random_state pinned so tie-breaking between equally good splits is
    # deterministic, consistent with the seeded svm_classifier
    tree_clf = DecisionTreeClassifier(random_state=0)
    tree_clf.fit(train_X, train_y)
    pred_y = tree_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Decision tree classifier test data")
    print_confusion_matrix(matrix, accuracy)
def naive_bayes_classifier(train_X, train_y, test_X, test_y):
    """Fit a multinomial naive Bayes model and report its confusion
    matrix and accuracy on the test split."""
    model = MultinomialNB()
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    print("\n# Naive bayes test data")
    print_confusion_matrix(confusion_matrix(test_y, predictions),
                           accuracy_score(test_y, predictions))
def svm_classifier(train_X, train_y, test_X, test_y):
    """Train a sigmoid-kernel SVM and print its confusion matrix and
    accuracy on the test split."""
    # probability=True was removed: predict_proba is never called, and
    # enabling it runs an expensive internal cross-validation during fit
    svm_clf = SVC(kernel='sigmoid', random_state=0,
                  gamma=.01, C=1)
    svm_clf.fit(train_X, train_y)
    pred_y = svm_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    # label fixed: SVC is a classifier, the old text said "regression"
    print("\n# SVM classifier test data")
    print_confusion_matrix(matrix, accuracy)
def print_confusion_matrix(matrix, accuracy):
    """Pretty-print a 2x2 confusion matrix (rows = true class, columns =
    predicted class) followed by the accuracy as a percentage."""
    print("\nConfusion matrix: \n pred:No pred:yes")
    print(f"true:No {matrix[0][0]} {matrix[0][1]}")
    print(f"true:Yes {matrix[1][0]} {matrix[1][1]}")
    print(f"\nAccuracy: {accuracy * 100}%")
def logistic_regression_classifier(train_X, train_y, test_X, test_y):
    """Train a logistic regression model and print its confusion matrix
    and accuracy on the test split."""
    # multi_class='multinomial' was removed: the parameter is deprecated
    # in scikit-learn 1.5 and removed in 1.7, and the target here is
    # binary, so it had no effect on the predictions anyway
    logreg_clf = LogisticRegression(solver='newton-cg')
    logreg_clf.fit(train_X, train_y)
    pred_y = logreg_clf.predict(test_X)
    matrix = confusion_matrix(test_y, pred_y)
    accuracy = accuracy_score(test_y, pred_y)
    print("\n# Logistic regression test data")
    print_confusion_matrix(matrix, accuracy)
def knn_classifier(train_X, train_y, test_X, test_y):
    """Fit a 5-nearest-neighbours classifier and report its confusion
    matrix and accuracy on the test split."""
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    print("\n# KNN test data")
    print_confusion_matrix(confusion_matrix(test_y, predictions),
                           accuracy_score(test_y, predictions))
if __name__ == '__main__':
    modules = ['numpy', 'pandas', 'sklearn']
    test_env.versions(modules)

    students = read_data('data/students.xlsx')

    # Write dataset overview and categorical summaries into results/
    describe_data.print_overview(
        students, file='results/students_overview.txt')
    describe_data.print_categorical(
        students, file='results/students_categorical_data.txt')

    students_X, students_y = preprocess_data(students)

    # train_test_split defaults to a 0.25 test share when the sizes are
    # omitted; random_state pinned so every run uses the same split
    train_X, test_X, train_y, test_y = train_test_split(
        students_X, students_y, random_state=60)

    # Run every required classifier on the same split
    logistic_regression_classifier(train_X, train_y, test_X, test_y)
    knn_classifier(train_X, train_y, test_X, test_y)
    svm_classifier(train_X, train_y, test_X, test_y)
    naive_bayes_classifier(train_X, train_y, test_X, test_y)
    decision_tree_classifier(train_X, train_y, test_X, test_y)
    # Bug fix: random_forest_classifier was defined but never called,
    # although the assignment requires a random forest run as well
    random_forest_classifier(train_X, train_y, test_X, test_y)
    print('Done')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement