import numpy as np
from tensorflow.keras import datasets


class KNearestNeighbor(object):
    """ a kNN classifier with L2 distance """

    def __init__(self):
        pass
    def train(self, X, y):
        """
        Train the classifier. For k-nearest neighbors this is just
        memorizing the training data.

        Inputs:
        - X: A numpy array of shape (num_train, D) containing the training data
          consisting of num_train samples each of dimension D.
        - y: A numpy array of shape (num_train,) containing the training labels,
          where y[i] is the label for X[i].
        """
        self.X_train = X
        self.y_train = y
    def predict(self, X, k=1, num_loops=0):
        """
        Predict labels for test data using this classifier.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data consisting
          of num_test samples each of dimension D.
        - k: The number of nearest neighbors that vote for the predicted labels.
        - num_loops: Determines which implementation to use to compute distances
          between training points and testing points.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)
        return self.predict_labels(dists, k=k)
    def compute_distances_two_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a nested loop over both the training data and the
        test data.

        Inputs:
        - X: A numpy array of shape (num_test, D) containing test data.

        Returns:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          is the Euclidean distance between the ith test point and the jth
          training point.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                ###############################################################
                # TODO:                                                       #
                # Compute the l2 distance between the ith test point and the  #
                # jth training point, and store the result in dists[i, j].    #
                # You should not use a loop over dimension.                   #
                ###############################################################
                dists[i, j] = np.sqrt(np.sum(np.square(X[i, :] - self.X_train[j, :])))
                ###############################################################
                #                      END OF YOUR CODE                       #
                ###############################################################
        return dists
    def compute_distances_one_loop(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using a single loop over the test data.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            ###################################################################
            # TODO:                                                           #
            # Compute the l2 distance between the ith test point and all      #
            # training points, and store the result in dists[i, :].           #
            ###################################################################
            # Broadcasting: (num_train, D) - (D,) -> (num_train, D); summing
            # over axis 1 collapses the feature dimension, leaving one
            # distance per training point.
            dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i, :]), axis=1))
            ###################################################################
            #                        END OF YOUR CODE                         #
            ###################################################################
        return dists
    def compute_distances_no_loops(self, X):
        """
        Compute the distance between each test point in X and each training point
        in self.X_train using no explicit loops.

        Input / Output: Same as compute_distances_two_loops
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        #######################################################################
        # TODO:                                                               #
        # Compute the l2 distance between all test points and all training    #
        # points without using any explicit loops, and store the result in    #
        # dists.                                                              #
        #                                                                     #
        # You should implement this function using only basic array           #
        # operations; in particular you should not use functions from scipy.  #
        #                                                                     #
        # HINT: Try to formulate the l2 distance using matrix multiplication  #
        #       and two broadcast sums.                                       #
        #######################################################################
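        # The vectorized form below rests on expanding the squared distance:
        #   ||x_i - t_j||^2 = ||x_i||^2 - 2 * x_i . t_j + ||t_j||^2
        # The dot-product term is a single (num_test, num_train) matrix
        # multiply; the two squared-norm terms broadcast as a row vector of
        # shape (num_train,) and a column vector of shape (num_test, 1).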
        dists = np.sqrt(-2 * np.dot(X, self.X_train.T) +
                        np.sum(np.square(self.X_train), axis=1) +
                        np.sum(np.square(X), axis=1)[:, np.newaxis])
        #######################################################################
        #                          END OF YOUR CODE                           #
        #######################################################################
        return dists
    def predict_labels(self, dists, k=1):
        """
        Given a matrix of distances between test points and training points,
        predict a label for each test point.

        Inputs:
        - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
          gives the distance between the ith test point and the jth training point.

        Returns:
        - y: A numpy array of shape (num_test,) containing predicted labels for the
          test data, where y[i] is the predicted label for the test point X[i].
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            ###################################################################
            # TODO:                                                           #
            # Use the distance matrix to find the k nearest neighbors of the  #
            # ith testing point, and use self.y_train to find the labels of   #
            # these neighbors. Store these labels in closest_y.               #
            # Hint: Look up the function numpy.argsort.                       #
            ###################################################################
            # closest_y is an array of length k storing the labels of the k
            # nearest neighbors to the ith test point. Slicing with [:k]
            # clamps safely even when k exceeds the number of training points.
            sortix = np.argsort(dists[i, :])
            closest_y = self.y_train[sortix[:k]].flatten()
            ###################################################################
            # TODO:                                                           #
            # Now that you have found the labels of the k nearest neighbors,  #
            # you need to find the most common label in the list closest_y of #
            # labels. Store this label in y_pred[i]. Break ties by choosing   #
            # the smaller label.                                              #
            ###################################################################
            # np.bincount tallies votes per label; np.argmax returns the
            # first (i.e. smallest) label among ties, as required.
            y_pred[i] = np.argmax(np.bincount(closest_y))
            ###################################################################
            #                        END OF YOUR CODE                         #
            ###################################################################
        return y_pred
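
# A minimal sanity check on toy data (not part of the original exercise): two
# well-separated clusters should be classified perfectly with k=1. The arrays
# below are illustrative values chosen for this sketch.
_toy_clf = KNearestNeighbor()
_toy_clf.train(np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]]),
               np.array([0, 0, 1, 1]))
assert np.array_equal(_toy_clf.predict(np.array([[0., 0.2], [10., 10.5]]),
                                       k=1, num_loops=0),
                      np.array([0., 1.]))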

## Using KNearestNeighbor class
#------------------------------

# Load the raw CIFAR-10 data.
(X_train, y_train), (X_test, y_test) = datasets.cifar10.load_data()

# As a sanity check, we print out the size of the training and test data.
print('Training data shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
# Subsample the data for more efficient code execution in this exercise.
num_training = 5000
mask = list(range(num_training))
X_train = X_train[mask]
y_train = y_train[mask]

num_test = 500
mask = list(range(num_test))
X_test = X_test[mask]
y_test = y_test[mask]

# Keras returns labels with shape (N, 1); flatten them to (N,) to match the
# shapes the classifier's docstrings expect.
y_train = y_train.flatten()
y_test = y_test.flatten()

# Reshape the image data into rows, and cast from uint8 to float64 so the
# subtractions in the distance computations cannot wrap around.
X_train = np.reshape(X_train, (X_train.shape[0], -1)).astype(np.float64)
X_test = np.reshape(X_test, (X_test.shape[0], -1)).astype(np.float64)
print(X_train.shape, X_test.shape)
# Create a kNN classifier instance.
# Remember that training a kNN classifier is a no-op:
# the classifier simply remembers the data and does no further processing.
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)

# Test your implementation:
dists = classifier.compute_distances_two_loops(X_test)
dists_one = classifier.compute_distances_one_loop(X_test)
# To ensure that our vectorized implementation is correct, we make sure that it
# agrees with the naive implementation. There are many ways to decide whether
# two matrices are similar; one of the simplest is the Frobenius norm. In case
# you haven't seen it before, the Frobenius norm of the difference of two
# matrices is the square root of the sum of the squared differences of all
# elements; in other words, reshape the matrices into vectors and compute the
# Euclidean distance between them.
difference = np.linalg.norm(dists - dists_one, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')
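# (Equivalently, one could use np.allclose(dists, dists_one), which performs
# the same elementwise comparison with relative and absolute tolerances.)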
dists_two = classifier.compute_distances_no_loops(X_test)

# Check that the distance matrix agrees with the ones we computed before:
difference = np.linalg.norm(dists - dists_two, ord='fro')
print('Difference was: %f' % (difference, ))
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')
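
# Finally, a short usage sketch: classify the test subset with the fully
# vectorized path and report accuracy. The choice k=5 is illustrative; the
# original exercise selects k by cross-validation.
y_test_pred = classifier.predict(X_test, k=5, num_loops=0)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))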