Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Import the packages used later in the script.
- import numpy as np
- from math import sqrt
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.metrics import accuracy_score
- from sklearn.model_selection import KFold
# Read in the data.
# NOTE(review): assumes both CSV files sit in the working directory and that
# every column is numeric — np.loadtxt raises otherwise. TODO confirm paths.
dataTrain = np.loadtxt("IDSWeedCropTrain.csv", delimiter=",")
dataTest = np.loadtxt("IDSWeedCropTest.csv", delimiter=",")
# Split input variables and labels: every column but the last is a feature,
# the last column is the class label.
XTrain = dataTrain[:, :-1]
YTrain = dataTrain[:, -1]
XTest = dataTest[:, :-1]
YTest = dataTest[:, -1]
##### Exercise 1 (Nearest neighbor classification) #####
print("Exercise 1:\n")

### My own implementation of the nearest neighbor classifier (1-NN).
def euclDistance(test_row, train_row):
    """
    Return the Euclidean distance between two equal-length feature rows.

    Parameters
    ----------
    test_row : sequence of float
        A row from the test data.
    train_row : sequence of float
        A row from the training data.

    Returns
    -------
    float
        sqrt of the sum of squared per-feature differences.
    """
    # zip pairs corresponding features; one-pass sum of squares replaces
    # the index-based `for i in range(len(...))` loop.
    return sqrt(sum((a - b) ** 2 for a, b in zip(test_row, train_row)))
# Function to determine the k-nearest neighbors.
def kNearestNeighbor(train_dataset, train_results, test_row, k=1):
    """
    Find the k training rows closest to test_row in Euclidean distance.

    Parameters
    ----------
    train_dataset : array-like, shape (n_samples, n_features)
        Training feature rows.
    train_results : array-like, shape (n_samples,)
        Labels corresponding to the rows of train_dataset.
    test_row : array-like, shape (n_features,)
        The row whose neighbors are sought.
    k : int, default 1
        Number of neighbors to return.

    Returns
    -------
    (neighbors, results) : tuple of np.ndarray
        The k nearest training rows and their labels, ordered from
        nearest to farthest.
    """
    train = np.asarray(train_dataset, dtype=float)
    labels = np.asarray(train_results)
    # Vectorized Euclidean distances from test_row to every training row,
    # replacing the per-row Python loop.
    diffs = train - np.asarray(test_row, dtype=float)
    distances = np.sqrt((diffs ** 2).sum(axis=1))
    # Stable argsort on the distance ALONE: ties keep training order.
    # (The old tuple-sort fell through to comparing entire rows on tied
    # distances — fragile and unnecessary.)
    order = np.argsort(distances, kind="stable")[:k]
    return train[order], labels[order]
# Function to make predictions based on the calculated k nearest neighbors.
def prediction(train_dataset, train_results, test_row, k=1):
    """
    Predict the label of test_row by majority vote among its k nearest
    training neighbors.

    Parameters are forwarded unchanged to kNearestNeighbor; see there
    for details.

    Returns
    -------
    The most frequent label among the k neighbors. On a tied vote the
    smallest label wins: np.unique returns labels sorted and argmax
    returns the FIRST maximum — the same tie-breaking the original
    tuple.index(max(...)) produced.
    """
    _, results = kNearestNeighbor(train_dataset=train_dataset,
                                  train_results=train_results,
                                  test_row=test_row, k=k)
    labels, counts = np.unique(results, return_counts=True)
    # Direct argmax replaces the tuple-conversion / .index(max(...)) dance.
    return labels[np.argmax(counts)]
# Function for prediction on an entire dataset.
def predictionOnDataset(train_dataset, train_results, test_dataset, k=1):
    """
    Predict a label for every row of test_dataset.

    Calls the prediction function once per test row, with the given
    training data, labels, and neighbor count k.

    Returns
    -------
    np.ndarray
        One predicted label per row of test_dataset, in row order.
    """
    # Comprehension instead of loop-and-append; same calls, same order.
    return np.array([
        prediction(train_dataset=train_dataset,
                   train_results=train_results,
                   test_row=row, k=k)
        for row in test_dataset
    ])
# Run my own 1-NN implementation over the whole test set.
results_test = predictionOnDataset(XTrain, YTrain, XTest)

### Implementation using scikit-learn for classifier.
knn_sk = KNeighborsClassifier(n_neighbors=1, metric="euclidean")
knn_sk.fit(XTrain, YTrain)

# Determine the classification accuracy of both models on the test set.
accTest_sk = accuracy_score(YTest, knn_sk.predict(XTest))
print("Scikit-learn KNN =", accTest_sk)
accTest_own = accuracy_score(YTest, results_test)
# Typo fixed in the output string: "implementatiom" -> "implementation".
print("Own implementation KNN =", accTest_own)
print()
print()
##### Exercise 2 (Cross-validation) #####
print("Exercise 2:\n")

# Create indices for CV.
cv = KFold(n_splits=5)

# Evaluate every candidate k on every fold and average the accuracies.
# BUG FIX: the original loop body only reassigned the fold variables, so
# the k-selection below evaluated nothing but the LAST fold — not a
# 5-fold cross-validation at all.
k_values = [1, 3, 5, 7, 9, 11]
fold_accs = {k: [] for k in k_values}
for train, test in cv.split(XTrain):
    XTrainCV, XTestCV = XTrain[train], XTrain[test]
    YTrainCV, YTestCV = YTrain[train], YTrain[test]
    for k in k_values:
        results = predictionOnDataset(XTrainCV, YTrainCV, XTestCV, k=k)
        fold_accs[k].append(accuracy_score(YTestCV, results))

# Mean cross-validation accuracy per k.
k_dict = {k: sum(accs) / len(accs) for k, accs in fold_accs.items()}
print("Dictionary:")
print(k_dict)
print()

# Pick the k with the highest mean accuracy; on a tie the smallest k
# wins (max returns the first maximum in k_values order).
k_best = max(k_values, key=lambda k: k_dict[k])
print("Best k by mean cross-validation accuracy: k_best =", k_best)
print()
print()
##### Exercise 3 (Evaluation of classification performance) #####
print("Exercise 3:\n")

# Re-run the classifier on the full training set with the k chosen by
# cross-validation, then use scikit-learn to measure the test accuracy.
results_k_best = predictionOnDataset(XTrain, YTrain, XTest, k=k_best)
accTest_k_best = accuracy_score(YTest, results_k_best)
# The label is derived from k_best instead of hard-coding "3", so it
# stays correct if the CV selection ever changes. Output is identical
# while k_best == 3.
print(f"{k_best}-NN (k_best-NN) test accuracy =", accTest_k_best)
print()
print()
##### Exercise 4 (Data normalization) #####
print("Exercise 4:\n")

# Center and scale using statistics of the TRAINING data only.
# BUG FIX: the original normalized the test set with the test set's own
# mean and std. That leaks test-set statistics and places train and test
# in (slightly) different coordinate systems. Standard practice (cf.
# sklearn.preprocessing.StandardScaler) is: fit on train, apply to both.
mean_vector_XTrain = np.mean(XTrain, axis=0)
XTrain_Zn = XTrain - mean_vector_XTrain
XTest_Zn = XTest - mean_vector_XTrain

std_vector_XTrain_Zn = np.std(XTrain_Zn, axis=0)
XTrain_Zni = XTrain_Zn / std_vector_XTrain_Zn
XTest_Zni = XTest_Zn / std_vector_XTrain_Zn
### Perform the steps (cross-validation) from Exercise 2 on the
### normalized data.
# Create indices for CV.
cv = KFold(n_splits=5)

# BUG FIX: as in Exercise 2, accumulate accuracies over ALL folds and
# average — the original loop silently used only the last fold.
k_values_norm = [1, 3, 5, 7, 9, 11]
fold_accs_norm = {k: [] for k in k_values_norm}
for train, test in cv.split(XTrain_Zni):
    XTrain_Zni_CV, XTest_Zni_CV = XTrain_Zni[train], XTrain_Zni[test]
    YTrainCV, YTestCV = YTrain[train], YTrain[test]
    for k in k_values_norm:
        results = predictionOnDataset(XTrain_Zni_CV, YTrainCV, XTest_Zni_CV, k=k)
        fold_accs_norm[k].append(accuracy_score(YTestCV, results))

# Mean cross-validation accuracy per k on the normalized data.
k_dict_norm = {k: sum(a) / len(a) for k, a in fold_accs_norm.items()}
print("Dictionary:")
print(k_dict_norm)
print()

# Best k on normalized data; ties break toward the smaller k.
k_best_norm = max(k_values_norm, key=lambda k: k_dict_norm[k])
print("Best k by mean cross-validation accuracy: k_best_norm =", k_best_norm)
print()

### Perform the steps (evaluation of classification performance) from
### Exercise 3 on the normalized data.
results_k_best_norm = predictionOnDataset(XTrain_Zni, YTrain, XTest_Zni, k=k_best_norm)
accTest_k_best_norm = accuracy_score(YTest, results_k_best_norm)
# Label derived from k_best_norm instead of hard-coding "3".
print(f"{k_best_norm}-NN (k_best_norm-NN) test accuracy =", accTest_k_best_norm)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement