Advertisement
Porse

k-NN

Feb 19th, 2020
377
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.35 KB | None | 0 0
  1. # Import the packages used later in the script.
  2. import numpy as np
  3. from math import sqrt
  4. from sklearn.neighbors import KNeighborsClassifier
  5. from sklearn.metrics import accuracy_score
  6. from sklearn.model_selection import KFold
  7.  
  8. # Read in the data.
  9. dataTrain = np.loadtxt("IDSWeedCropTrain.csv", delimiter=",")
  10. dataTest = np.loadtxt("IDSWeedCropTest.csv", delimiter=",")
  11.  
  12. # Split Input variables and labels.
  13. XTrain = dataTrain[:, :-1]
  14. YTrain = dataTrain[:, -1]
  15. XTest = dataTest[:, :-1]
  16. YTest = dataTest[:, -1]
  17.  
  18.  
  19.  
  20. ##### Exercise 1 (Nearest neighbor classification) #####
  21.  
  22. print("Exercise 1:\n")
  23.  
  24. ### My own implementation of the nearest neighbor classifier (1-NN).
  25.  
  26. # Function to calculate the euclidean distance between two rows.
  27. def euclDistance(test_row, train_row):
  28.     """
  29.    Function that calculates the euclidian distance between two rows.
  30.    In this assignment, between a row in the test data and a row in the
  31.    training data.
  32.    """
  33.    
  34.     d = 0
  35.     for i in range(len(test_row)):
  36.         d += (test_row[i] - train_row[i]) ** 2
  37.     distance = sqrt(d)
  38.    
  39.     return distance
  40.  
  41.  
  42. # Function to determine the k-nearest neighbor.
  43. def kNearestNeighbor(train_dataset, train_results, test_row, k=1):
  44.     """
  45.    Function that takes as input a row from a dataset, compares it to all
  46.    rows in the training dataset and calculates euclidian distances. Then,
  47.    the function sorts the rows (and corresponding results) from the training
  48.    dataset according to the calculated distances. Finally, the function returns
  49.    the k nearest neighbors (rows and results from training data).
  50.    """
  51.    
  52.     dist_list = []
  53.     rows_list = []
  54.     results_list = list(train_results)
  55.     for row in train_dataset:
  56.         distance = euclDistance(test_row=test_row, train_row=row)
  57.         dist_list.append(distance)
  58.         rows_list.append(list(row))
  59.  
  60.     dist_rows_list = zip(dist_list, rows_list, results_list)
  61.     dist_list_sorted, rows_list_sorted, results_list_sorted = zip(*sorted(dist_rows_list))
  62.    
  63.     neighbors = np.array(rows_list_sorted[:k])
  64.     results = np.array(results_list_sorted[:k])
  65.    
  66.     return neighbors, results
  67.  
  68.  
  69. # Function to make predictions based on the calculated k nearest neighbors.
  70. def prediction(train_dataset, train_results, test_row, k=1):
  71.     """
  72.    Function that takes as input a training dataset, the results of the
  73.    training dataset, and a row from a test dataset - these parameters
  74.    are used for calling the kNearestNeighbor function.
  75.    Then, each of the results (0.0 and 1.0) are counted and value with the
  76.    highest count is returned.
  77.    """
  78.    
  79.     neighbors, results = kNearestNeighbor(train_dataset=train_dataset, train_results=train_results, test_row=test_row, k=k)
  80.     unique, counts = np.unique(results, return_counts=True)
  81.     unique_tuple, counts_tuple = tuple(unique), tuple(counts)
  82.     index_max = counts_tuple.index(max(counts_tuple))
  83.     predict_value = unique_tuple[index_max]
  84.    
  85.     return predict_value
  86.  
  87.  
  88. # Function for prediction on an entire dataset.
  89. def predictionOnDataset(train_dataset, train_results, test_dataset, k=1):
  90.     """
  91.    Function that takes as input a training dataset, the results of the
  92.    training dataset, and a row from a test dataset - these parameters
  93.    are used for calling the prediction function in a for loop, to apply
  94.    this function to each row in the test dataset. All the predictions
  95.    are stored in a list and returned as a numpy array.
  96.    """
  97.    
  98.     predict_value_list = []
  99.    
  100.     for test_row in test_dataset:
  101.         predict_value = prediction(train_dataset=train_dataset, train_results=train_results, test_row=test_row, k=k)
  102.         predict_value_list.append(predict_value)
  103.    
  104.     predicted_results = np.array(predict_value_list)
  105.    
  106.     return predicted_results
  107.  
  108. results_test = predictionOnDataset(XTrain, YTrain, XTest)
  109.  
  110. ### Implementation using scikit-learn for classifier.
  111. knn_sk = KNeighborsClassifier(n_neighbors=1, metric="euclidean")
  112. knn_sk.fit(XTrain, YTrain)
  113.  
  114. # Determine the classification accuracy of the model.
  115. accTest_sk = accuracy_score(YTest, knn_sk.predict(XTest))
  116. print("Scikit-learn KNN =", accTest_sk)
  117.  
  118. accTest_own = accuracy_score(YTest, results_test)
  119. print("Own implementatiom KNN =", accTest_own)
  120.  
  121. print()
  122. print()
  123.  
  124.  
  125.  
  126. ##### Exercise 2 (Cross-validation) #####
  127.  
  128. print("Exercise 2:\n")
  129.  
  130. # Create indices for CV.
  131. cv = KFold(n_splits=5)
  132.  
  133. # Loop over CV folds.
  134. for train, test in cv.split(XTrain):
  135.     XTrainCV, XTestCV, YTrainCV, YTestCV = XTrain[train], XTrain[test], YTrain[train], YTrain[test]
  136.  
  137. # Loop through each of the values of k and store corresponding classification
  138. # error in dictionary.
  139. k_values = [1, 3, 5, 7, 9, 11]
  140. k_dict = {}
  141.  
  142. for k in k_values:
  143.     results = predictionOnDataset(XTrainCV, YTrainCV, XTestCV, k=k)
  144.     accTest = accuracy_score(YTestCV, results)
  145.     k_dict[k] = accTest
  146.  
  147. print("Dictionary:")
  148. print(k_dict)
  149. print()
  150. print("As we can see, a value of k equal to either 3 or 5 produces the best results. As such, we just pick one of these two - in this case, we pick k_best = 3.")
  151.  
  152. k_best = 3
  153.  
  154. print()
  155. print()
  156.  
  157.  
  158.  
  159. ##### Exercise 3 (Evaluation of classification performance) #####
  160.  
  161. print("Exercise 3:\n")
  162.  
  163. # Call the predictionOnDataset function defined earlier (but change k to 3),
  164. # then use scikit-learn to measure the test accuracy.
  165. results_k_best = predictionOnDataset(XTrain, YTrain, XTest, k=k_best)
  166. accTest_k_best = accuracy_score(YTest, results_k_best)
  167.  
  168. print("3-NN (k_best-NN) test accuracy =", accTest_k_best)
  169.  
  170. print()
  171. print()
  172.  
  173.  
  174. ##### Exercise 4 (Data normalization) #####
  175.  
  176. print("Exercise 4:\n")
  177.  
  178. # Center data.
  179. mean_vector_XTrain = np.mean(XTrain, axis=0)
  180. XTrain_Zn = XTrain - mean_vector_XTrain
  181.  
  182. mean_vector_XTest = np.mean(XTest, axis=0)
  183. XTest_Zn = XTest - mean_vector_XTest
  184.  
  185. # Normalize data.
  186. std_vector_XTrain_Zn = np.std(XTrain_Zn, axis=0)
  187. XTrain_Zni = XTrain_Zn / std_vector_XTrain_Zn
  188.  
  189. std_vector_XTest_Zn = np.std(XTest_Zn, axis=0)
  190. XTest_Zni = XTest_Zn / std_vector_XTest_Zn
  191.  
  192.  
  193. ### Perform the steps (cross-validation) from Exercise 2
  194.  
  195. # Create indices for CV.
  196. cv = KFold(n_splits=5)
  197.  
  198. # Loop over CV folds.
  199. for train, test in cv.split(XTrain_Zni):
  200.     XTrain_Zni_CV, XTest_Zni_CV, YTrainCV, YTestCV = XTrain_Zni[train], XTrain_Zni[test], YTrain[train], YTrain[test]
  201.  
  202. # Loop through each of the values of k and store corresponding classification
  203. # error in dictionary.
  204. k_values_norm = [1, 3, 5, 7, 9, 11]
  205. k_dict_norm = {}
  206.  
  207. for k in k_values_norm:
  208.     results = predictionOnDataset(XTrain_Zni_CV, YTrainCV, XTest_Zni_CV, k=k)
  209.     accTest = accuracy_score(YTestCV, results)
  210.     k_dict_norm[k] = accTest
  211.  
  212. print("Dictionary:")
  213. print(k_dict_norm)
  214. print()
  215. print("As we can see, a value of k equal to either 3, 5 or 7 produces the best results. As such, we just pick one of these three - in this case, we pick k_best_norm = 3, like in the previous exercise.")
  216.  
  217. k_best_norm = 3
  218.  
  219. print()
  220.  
  221.  
  222. ### Perform the steps (Evaluation of classification performance) from Exercise 3.
  223.  
  224. # Call the predictionOnDataset function defined earlier (but change k to 3),
  225. # then use scikit-learn to measure the test accuracy.
  226. results_k_best_norm = predictionOnDataset(XTrain_Zni, YTrain, XTest_Zni, k=k_best_norm)
  227. accTest_k_best_norm = accuracy_score(YTest, results_k_best_norm)
  228.  
  229. print("3-NN (k_best_norm-NN) test accuracy =", accTest_k_best_norm)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement