Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn.model_selection import KFold
- from sklearn.tree import DecisionTreeClassifier
- from sklearn import model_selection,preprocessing, neighbors,metrics
- from sklearn.utils import shuffle
- from sklearn.neighbors import KNeighborsClassifier
- import random
- from sklearn.metrics import accuracy_score
- import matplotlib.pyplot as plt
- import testing,training
- import os
- import numpy as np
# Per-fold containers: train_X[i] / test_X[i] hold the feature rows for fold i,
# train_y[i] / test_y[i] the matching labels (filled by the KFold loop below).
train_X = []
test_X = []
train_y = []
test_y = []
# Largest k tried for the k-NN classifier (models are built for k = 1..max_neighbors).
max_neighbors = 15
# creating a np array for dtc-->Decision tree classifier and knn .
np_dtc = np.array([])
np_knn = np.array([])
# Output directories for saved models, accuracies and per-fold CSVs.
# os.makedirs(..., exist_ok=True) instead of os.mkdir: the original raised
# FileExistsError on every re-run once the directories already existed.
for _directory in ('dtc', 'knn', 'accuracy', 'train_csv', 'test_csv'):
    os.makedirs(_directory, exist_ok=True)
# Load the Iris data set (comma-separated, no header row in the file).
# pd.read_csv replaces the deprecated pd.read_table(sep=",") spelling.
data_frame = pd.read_csv(
    r"iris.data",
    header=None,
    names=['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'class'],
)
# NOTE(review): the original called data_frame.head() here and discarded the
# result — a notebook leftover that is a no-op in a script, so it was removed.
# Converted the last label as classes: encode species as integer codes (0, 1, 2).
data_frame['class'] = pd.Categorical(data_frame['class'])
data_frame['class'] = data_frame['class'].cat.codes
# 5-fold cross-validation splitter.
k_fold = model_selection.KFold(n_splits=5)
# Features = every column except the last; target = the encoded class column.
X = data_frame[data_frame.columns[:-1]].values
y = data_frame['class'].values
# Shuffling the X and y before splitting (fixed seed for reproducibility).
X, y = shuffle(X, y, random_state=999)
# Splitting the data into 5 cross-validation folds: collect the train/test
# feature rows (and the matching labels) for every fold.
for train_idx, test_idx in k_fold.split(data_frame):
    train_X.append(X[train_idx])
    test_X.append(X[test_idx])
    train_y.append(y[train_idx])
    test_y.append(y[test_idx])
print(train_X)
print(train_y)
# Persist each fold as CSV so the training/testing helpers can reload it.
for fold in range(5):
    np.savetxt("train_csv/train_X_" + str(fold) + ".csv", train_X[fold], delimiter=",")
    np.savetxt("train_csv/train_y_" + str(fold) + ".csv", train_y[fold], delimiter=",")
    np.savetxt("test_csv/test_X_" + str(fold) + ".csv", test_X[fold], delimiter=",")
    np.savetxt("test_csv/test_y_" + str(fold) + ".csv", test_y[fold], delimiter=",")
# Flat accumulators for per-fold accuracies (acc_knn is reshaped further down).
acc_dtc = np.array([])
acc_knn = np.array([])
for fold in range(5):
    # This function will train and save the model for dtc in txt format in dtc directory
    training.create_dtc(fold)
    # This function will train and save the model for knn for different values
    # of k in txt format in knn directory
    training.create_knn(fold, max_neighbors)
    # Evaluate the saved decision tree on this fold's test split.
    dtc_score = testing.run_dtc(fold)
    print("Decision Tree Accuracy" + str(dtc_score))
    acc_dtc = np.append(acc_dtc, dtc_score)
    # Evaluate the saved k-NN models (one per k) on this fold's test split.
    knn_score = testing.run_knn(fold, max_neighbors)
    print("KNN Accuracy" + str(knn_score))
    acc_knn = np.append(acc_knn, knn_score)
# reshaping in a matrix: one row per fold, one column per value of k.
# (int(5) was redundant in the original — 5 is already an int.)
acc_knn = acc_knn.reshape(5, max_neighbors)
# average mean of k = 1..max_neighbors over all 5 cross-validation folds
mean_knn_accuracy = np.mean(acc_knn, axis=0)
# Mean of dtc accuracies (may differ between runs due to shuffling).
# NOTE(review): the original computed np.mean(acc_dtc) here and discarded the
# result — a no-op statement outside a notebook — so it was removed; the mean
# is recomputed where it is actually used, in the comparison plot below.
plt.bar([k + 1 for k in range(max_neighbors)], mean_knn_accuracy)
plt.ylim([0.9, 1])
plt.xlabel("Value of K")
plt.ylabel("Mean Accuracy of 5 fold")
plt.title("Mean Accuracy vs K")
plt.show()
# Side-by-side accuracy comparison: k-NN (at index 7, i.e. k = 8) vs the
# mean decision-tree accuracy over the 5 folds.
Classifier = ('KNN', 'Decision Tree Classifier')
y_pos = np.arange(len(Classifier))
bar_heights = [mean_knn_accuracy[7], np.mean(acc_dtc)]
plt.bar(y_pos, bar_heights)
plt.ylim([0.9, 1])
plt.title("k-NN and Decision tree accuracy comparison")
plt.ylabel("Accuracy Score")
plt.xlabel("Classifiers Name")
plt.xticks(y_pos, Classifier)
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement