Advertisement
Guest User

Untitled

a guest
Nov 22nd, 2020 (edited)
1,369
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.37 KB | None | 0 0
  1. import numpy as np
  2. from past.builtins import xrange
  3. from collections import Counter
  4. import random
  5. from tensorflow.keras import datasets
  6.  
  7.  
  8. class KNearestNeighbor(object):
  9.     """ a kNN classifier with L2 distance """
  10.  
  11.     def __init__(self):
  12.         pass
  13.  
  14.     def train(self, X, y):
  15.         """
  16.    Train the classifier. For k-nearest neighbors this is just
  17.    memorizing the training data.
  18.  
  19.    Inputs:
  20.    - X: A numpy array of shape (num_train, D) containing the training data
  21.      consisting of num_train samples each of dimension D.
  22.    - y: A numpy array of shape (N,) containing the training labels, where
  23.         y[i] is the label for X[i].
  24.    """
  25.         self.X_train = X
  26.         self.y_train = y
  27.  
  28.     def predict(self, X, k=1, num_loops=0):
  29.         """
  30.    Predict labels for test data using this classifier.
  31.  
  32.    Inputs:
  33.    - X: A numpy array of shape (num_test, D) containing test data consisting
  34.         of num_test samples each of dimension D.
  35.    - k: The number of nearest neighbors that vote for the predicted labels.
  36.    - num_loops: Determines which implementation to use to compute distances
  37.      between training points and testing points.
  38.  
  39.    Returns:
  40.    - y: A numpy array of shape (num_test,) containing predicted labels for the
  41.      test data, where y[i] is the predicted label for the test point X[i].
  42.    """
  43.         if num_loops == 0:
  44.             dists = self.compute_distances_no_loops(X)
  45.         elif num_loops == 1:
  46.             dists = self.compute_distances_one_loop(X)
  47.         elif num_loops == 2:
  48.             dists = self.compute_distances_two_loops(X)
  49.         else:
  50.             raise ValueError('Invalid value %d for num_loops' % num_loops)
  51.  
  52.         return self.predict_labels(dists, k=k)
  53.  
  54.     def compute_distances_two_loops(self, X):
  55.         """
  56.    Compute the distance between each test point in X and each training point
  57.    in self.X_train using a nested loop over both the training data and the
  58.    test data.
  59.  
  60.    Inputs:
  61.    - X: A numpy array of shape (num_test, D) containing test data.
  62.  
  63.    Returns:
  64.    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
  65.      is the Euclidean distance between the ith test point and the jth training
  66.      point.
  67.    """
  68.         num_test = X.shape[0]
  69.         num_train = self.X_train.shape[0]
  70.         dists = np.zeros((num_test, num_train))
  71.         for i in xrange(num_test):
  72.             for j in xrange(num_train):
  73.                 #####################################################################
  74.                 # TODO:                                                             #
  75.                 # Compute the l2 distance between the ith test point and the jth    #
  76.                 # training point, and store the result in dists[i, j]. You should   #
  77.                 # not use a loop over dimension.                                    #
  78.                 #####################################################################
  79.                 #dists[i, j] = np.sqrt(np.sum((X[i, :] - self.X_train[j, :]) ** 2))
  80.                 dists[i, j] = np.sqrt(np.sum(np.square(X[i, :] - self.X_train[j, :])))
  81.                 #####################################################################
  82.                 #                       END OF YOUR CODE                            #
  83.                 #####################################################################
  84.         return dists
  85.  
  86.     def compute_distances_one_loop(self, X):
  87.         """
  88.    Compute the distance between each test point in X and each training point
  89.    in self.X_train using a single loop over the test data.
  90.  
  91.    Input / Output: Same as compute_distances_two_loops
  92.    """
  93.         num_test = X.shape[0]
  94.         num_train = self.X_train.shape[0]
  95.         dists = np.zeros((num_test, num_train))
  96.         for i in xrange(num_test):
  97.             #######################################################################
  98.             # TODO:                                                               #
  99.             # Compute the l2 distance between the ith test point and all training #
  100.             # points, and store the result in dists[i, :].                        #
  101.             #######################################################################
  102.             dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i, :]), axis = 1))
  103.             #######################################################################
  104.             #                         END OF YOUR CODE                            #
  105.             #######################################################################
  106.         print(dists.shape)
  107.         return dists
  108.  
  109.     def compute_distances_no_loops(self, X):
  110.         """
  111.    Compute the distance between each test point in X and each training point
  112.    in self.X_train using no explicit loops.
  113.  
  114.    Input / Output: Same as compute_distances_two_loops
  115.    """
  116.         num_test = X.shape[0]
  117.         num_train = self.X_train.shape[0]
  118.         dists = np.zeros((num_test, num_train))
  119.         #########################################################################
  120.         # TODO:                                                                 #
  121.         # Compute the l2 distance between all test points and all training      #
  122.         # points without using any explicit loops, and store the result in      #
  123.         # dists.                                                                #
  124.         #                                                                       #
  125.         # You should implement this function using only basic array operations; #
  126.         # in particular you should not use functions from scipy.                #
  127.         #                                                                       #
  128.         # HINT: Try to formulate the l2 distance using matrix multiplication    #
  129.         #       and two broadcast sums.                                         #
  130.         #########################################################################
  131.         dists = np.sqrt(-2 * np.dot(X, self.X_train.T) +
  132.                         np.sum(np.square(self.X_train), axis=1) +
  133.                         np.sum(np.square(X), axis=1)[:, np.newaxis])
  134.         print(dists.shape)
  135.         #########################################################################
  136.         #                         END OF YOUR CODE                              #
  137.         #########################################################################
  138.         return dists
  139.  
  140.     def predict_labels(self, dists, k=1):
  141.         """
  142.    Given a matrix of distances between test points and training points,
  143.    predict a label for each test point.
  144.  
  145.    Inputs:
  146.    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
  147.      gives the distance between the ith test point and the jth training point.
  148.  
  149.    Returns:
  150.    - y: A numpy array of shape (num_test,) containing predicted labels for the
  151.      test data, where y[i] is the predicted label for the test point X[i].
  152.    """
  153.         num_test = dists.shape[0]
  154.         y_pred = np.zeros(num_test)
  155.         for i in xrange(num_test):
  156.             # A list of length k storing the labels of the k nearest neighbors to
  157.             # the ith test point.
  158.             #########################################################################
  159.             # TODO:                                                                 #
  160.             # Use the distance matrix to find the k nearest neighbors of the ith    #
  161.             # testing point, and use self.y_train to find the labels of these       #
  162.             # neighbors. Store these labels in closest_y.                           #
  163.             # Hint: Look up the function numpy.argsort.                             #
  164.             #########################################################################
  165.             sortix = np.argsort(dists[i, :])
  166.             closest_y = self.y_train[sortix[:min(k, len(sortix))]]
  167.             #########################################################################
  168.             # TODO:                                                                 #
  169.             # Now that you have found the labels of the k nearest neighbors, you    #
  170.             # need to find the most common label in the list closest_y of labels.   #
  171.             # Store this label in y_pred[i]. Break ties by choosing the smaller     #
  172.             # label.                                                                #
  173.             #########################################################################
  174.             #print(self.y_train.shape)
  175.             #print(sortix.shape)
  176.             #print(len(sortix))
  177.             #print(sortix[:min(k, len(sortix))])
  178.             #print(closest_y)
  179.             #print(type(closest_y))
  180.             y_pred[i] = Counter(closest_y[0]).most_common(1)[0][0]
  181.             #########################################################################
  182.             #                           END OF YOUR CODE                            #
  183.             #########################################################################
  184.  
  185.         return y_pred
  186.  
  187. ## Using KNearestNeighbor class
  188. #------------------------------
  189. # Load the raw CIFAR-10 data.
  190. (X_train, y_train), (X_test, y_test) = datasets.cifar10.load_data()
  191.  
  192. # As a sanity check, we print out the size of the training and test data.
  193. print('Training data shape: ', X_train.shape)
  194. print('Training labels shape: ', y_train.shape)
  195. print('Test data shape: ', X_test.shape)
  196. print('Test labels shape: ', y_test.shape)
  197.  
  198. # Subsample the data for more efficient code execution in this exercise
  199. num_training = 5000
  200. mask = list(range(num_training))
  201. X_train = X_train[mask]
  202. y_train = y_train[mask]
  203.  
  204. num_test = 500
  205. mask = list(range(num_test))
  206. X_test = X_test[mask]
  207. y_test = y_test[mask]
  208.  
  209. # Reshape the image data into rows
  210. X_train = np.reshape(X_train, (X_train.shape[0], -1))
  211. X_test = np.reshape(X_test, (X_test.shape[0], -1))
  212. print(X_train.shape, X_test.shape)
  213.  
  214. # Create a kNN classifier instance.
  215. # Remember that training a kNN classifier is a noop:
  216. # the Classifier simply remembers the data and does no further processing
  217. classifier = KNearestNeighbor()
  218. classifier.train(X_train, y_train)
  219.  
  220. # Test your implementation:
  221. dists = classifier.compute_distances_two_loops(X_test)
  222.  
  223. dists_one = classifier.compute_distances_one_loop(X_test)
  224.  
  225. # To ensure that our vectorized implementation is correct, we make sure that it
  226. # agrees with the naive implementation. There are many ways to decide whether
  227. # two matrices are similar; one of the simplest is the Frobenius norm. In case
  228. # you haven't seen it before, the Frobenius norm of two matrices is the square
  229. # root of the squared sum of differences of all elements; in other words, reshape
  230. # the matrices into vectors and compute the Euclidean distance between them.
  231. difference = np.linalg.norm(dists - dists_one, ord='fro')
  232. print('Difference was: %f' % (difference, ))
  233. if difference < 0.001:
  234.     print('Good! The distance matrices are the same')
  235. else:
  236.     print('Uh-oh! The distance matrices are different')
  237.  
  238. dists_two = classifier.compute_distances_no_loops(X_test)
  239.  
  240. # check that the distance matrix agrees with the one we computed before:
  241. difference = np.linalg.norm(dists - dists_two, ord='fro')
  242. print('Difference was: %f' % (difference, ))
  243. if difference < 0.001:
  244.     print('Good! The distance matrices are the same')
  245. else:
  246.     print('Uh-oh! The distance matrices are different')
  247.  
  248.  
  249.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement