Guest User

Untitled

a guest
Aug 19th, 2018
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.05 KB | None | 0 0
  1. import numpy as np
  2.  
  3. class KNearestNeighbor:
  4. """ a kNN classifier with L2 distance """
  5.  
  6. def __init__(self):
  7. pass
  8.  
  9. def train(self, X, y):
  10. """
  11. Train the classifier. For k-nearest neighbors this is just
  12. memorizing the training data.
  13.  
  14. Input:
  15. X - A num_train x dimension array where each row is a training point.
  16. y - A vector of length num_train, where y[i] is the label for X[i, :]
  17. """
  18. self.X_train = X
  19. self.y_train = y
  20.  
  21. def predict(self, X, k=1, num_loops=0):
  22. """
  23. Predict labels for test data using this classifier.
  24.  
  25. Input:
  26. X - A num_test x dimension array where each row is a test point.
  27. k - The number of nearest neighbors that vote for predicted label
  28. num_loops - Determines which method to use to compute distances
  29. between training points and test points.
  30.  
  31. Output:
  32. y - A vector of length num_test, where y[i] is the predicted label for the
  33. test point X[i, :].
  34. """
  35. if num_loops == 0:
  36. dists = self.compute_distances_no_loops(X)
  37. elif num_loops == 1:
  38. dists = self.compute_distances_one_loop(X)
  39. elif num_loops == 2:
  40. dists = self.compute_distances_two_loops(X)
  41. else:
  42. raise ValueError('Invalid value %d for num_loops' % num_loops)
  43.  
  44. return self.predict_labels(dists, k=k)
  45.  
  46. def compute_distances_two_loops(self, X):
  47. """
  48. Compute the distance between each test point in X and each training point
  49. in self.X_train using a nested loop over both the training data and the
  50. test data.
  51.  
  52. Input:
  53. X - An num_test x dimension array where each row is a test point.
  54.  
  55. Output:
  56. dists - A num_test x num_train array where dists[i, j] is the distance
  57. between the ith test point and the jth training point.
  58. """
  59. num_test = X.shape[0]
  60. num_train = self.X_train.shape[0]
  61. dists = np.zeros((num_test, num_train))
  62. for i in xrange(num_test):
  63. for j in xrange(num_train):
  64. #####################################################################
  65. # TODO: #
  66. # Compute the l2 distance between the ith test point and the jth #
  67. # training point, and store the result in dists[i, j] #
  68. #####################################################################
  69. dists[i,j] = np.sqrt(np.sum(np.square(X[i]-self.X_train[j]))) #usere loesung
  70. # dists[i,j] = np.sqrt(np.sum((X[i] - self.X_train[j]) ** 2)) #erste loesung
  71. # dists[i,j] = np.sqrt(((X[i,:]-self.X_train[j,:])**2).sum(axis=0))
  72. # dists[i,j] = np.sqrt(np.sum((X[i,:]-self.X_train[j,:])**2,axis=0))
  73. # dists[i,j] = np.sqrt(np.sum(np.square(X[i,:]-self.X_train[j,:]),axis=1))
  74. #####################################################################
  75. # END OF YOUR CODE #
  76. #####################################################################
  77. return dists
  78.  
  79. def compute_distances_one_loop(self, X):
  80. """
  81. Compute the distance between each test point in X and each training point
  82. in self.X_train using a single loop over the test data.
  83.  
  84. Input / Output: Same as compute_distances_two_loops
  85. """
  86. num_test = X.shape[0]
  87. num_train = self.X_train.shape[0]
  88. dists = np.zeros((num_test, num_train))
  89. for i in xrange(num_test):
  90. #######################################################################
  91. # TODO: #
  92. # Compute the l2 distance between the ith test point and all training #
  93. # points, and store the result in dists[i, :]. #
  94. #######################################################################
  95. #dists[i,:] = np.linalg.norm(X[i,:]-self.X_train,axis = 1)
  96. dists[i,:]= np.sqrt(np.sum(np.square(X[i,:]-self.X_train),axis=1)) #unsere loesung
  97. #######
  98. # X[i].shape = (3072,) -> broadcast to fit X_train shape. erste loesung
  99. #dists[i] = np.sqrt(np.sum((X[i] - self.X_train) ** 2, axis=1))
  100. #######################################################################
  101. # END OF YOUR CODE #
  102. #######################################################################
  103. return dists
  104.  
  105. def compute_distances_no_loops(self, X):
  106. """
  107. Compute the distance between each test point in X and each training point
  108. in self.X_train using no explicit loops.
  109.  
  110. Input / Output: Same as compute_distances_two_loops
  111. """
  112. num_test = X.shape[0]
  113. num_train = self.X_train.shape[0]
  114. dists = np.zeros((num_test, num_train))
  115. #########################################################################
  116. # TODO: #
  117. # Compute the l2 distance between all test points and all training #
  118. # points without using any explicit loops, and store the result in #
  119. # dists. #
  120. # HINT: Try to formulate the l2 distance using matrix multiplication #
  121. # and two broadcast sums. #
  122. #########################################################################
  123. #X1 = np.zeros((X.shape[0],1,X.shape[1]))
  124. #X1[:,0,:] = X
  125. #X_train1 = np.zeros((1,self.X_train.shape[0],self.X_train.shape[1]))
  126. #X_train1[0,:,:] = self.X_train
  127. #dists = np.linalg.norm(X_train1-X1,axis = 2)
  128. dists = np.sqrt((X**2).sum(axis=1)[:, np.newaxis] + (self.X_train**2).sum(axis=1) - 2 * X.dot(self.X_train.T))
  129. #####
  130. ## from scipy.spatial.distance import cdist
  131. ## dists = cdist(X, self.X_train, metric='euclidean')
  132. ## to fully vectorize, use of the formula: (a-b)^2 = a^2 + b^2 -2ab
  133. ## (a-b)^2 = quadra -2 * prod
  134. ## with quadra = a^2 + b^2; and prod = ab
  135. #a2 = np.sum(X ** 2, axis=1) # shape: (500,)
  136. #b2 = np.sum(self.X_train ** 2, axis=1) # shape: (5000,)
  137. #print a2.shape
  138. #aa2 = a2.reshape(a2.shape[0], 1) # reshape a2 to (500,1) to be able to broadcast a2 and sum to b2
  139. #print aa2.shape
  140. #quadra = aa2 + b2 # shape = (500, 5000)
  141. #prod = np.dot(X, self.X_train.T) # shape = (500, 5000)
  142. #dists = np.sqrt(aa2 + b2 -2 * np.dot(X, self.X_train.T)) # shape = (500, 5000)
  143. #########################################################################
  144. # END OF YOUR CODE #
  145. #########################################################################
  146. return dists
  147.  
  148. def predict_labels(self, dists, k=1):
  149. """
  150. Given a matrix of distances between test points and training points,
  151. predict a label for each test point.
  152.  
  153. Input:
  154. dists - A num_test x num_train array where dists[i, j] gives the distance
  155. between the ith test point and the jth training point.
  156.  
  157. Output:
  158. y - A vector of length num_test where y[i] is the predicted label for the
  159. ith test point.
  160. """
  161. num_test = dists.shape[0]
  162. y_pred = np.zeros(num_test)
  163. for i in xrange(num_test):
  164. # A list of length k storing the labels of the k nearest neighbors to
  165. # the ith test point.
  166. closest_y = []
  167. #########################################################################
  168. # TODO: #
  169. # Use the distance matrix to find the k nearest neighbors of the ith #
  170. # training point, and use self.y_train to find the labels of these #
  171. # neighbors. Store these labels in closest_y. #
  172. # Hint: Look up the function numpy.argsort. #
  173. #########################################################################
  174. closest_y = np.array(self.y_train[np.argsort(dists[i,:],axis=0)[:k]]) #unsere loesung
  175. #closest_y = np.array(self.y_train[np.argsort(dists[i,:],axis=0)[:k]],dtype=float) #unsere loesung
  176. ####
  177. #ind = np.argsort(dists[i, :], axis=0) #erste loesung
  178. #closest_y = self.y_train[ind[:k]]
  179. #########################################################################
  180. # TODO: #
  181. # Now that you have found the labels of the k nearest neighbors, you #
  182. # need to find the most common label in the list closest_y of labels. #
  183. # Store this label in y_pred[i]. Break ties by choosing the smaller #
  184. # label. #
  185. #########################################################################
  186. #d = {x:closest_y.count(x) for x in closest_y}
  187. #c, b = d.keys(), d.values()
  188. #e=c[b.index(max(b))]
  189. #y_pred[i] = self.y_train[closest_y]
  190. y_pred[i] = np.bincount(closest_y).argmax() # meine loesung worked
  191.  
  192. ########### worked
  193. #from scipy.stats import mode # zweite loesung meine loesong
  194. #y_pred[i] = mode(closest_y)[0][0]
  195. ############### zweite loesung worked but the accuracy is a disaster
  196. #from collections import Counter
  197. ## pick nearest neighbors
  198. #closest_y = self.y_train[np.argsort(dists[i]) < k]
  199.  
  200. ## count which class appears most
  201. #y_pred[i] = Counter(closest_y).most_common(1)[0][0]
  202. ############## erste loesung worked
  203. #types = np.unique(closest_y)
  204. ## print 'types', types
  205. #closest_y_list = closest_y.tolist()
  206. #occ = [closest_y_list.count(lab) for lab in types]
  207. ## print 'occ:', occ
  208. #y_pred[i] = types[np.argmax(occ)]
  209. ## print 'pred: ', y_pred[i]
  210.  
  211. #########################################################################
  212. # END OF YOUR CODE #
  213. #########################################################################
  214.  
  215. return y_pred
  216.  
  217. # with the CIFAR dataset
  218. # Two loop version took 80.492011 seconds
  219. # One loop version took 112.854208 seconds
  220. # No loop version took 13.636812 seconds
  221.  
  222. # accuracy with k=10:
  223. # Got 141 / 500 correct => accuracy: 0.282000
  224.  
  225. # Final accuracy, after 5-fold cross-validation to choose best_k=50:
  226. # Got 293 / 500 correct => accuracy: 0.586000
Add Comment
Please, Sign In to add comment