Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- class KNearestNeighbor:
- """ a kNN classifier with L2 distance """
- def __init__(self):
- pass
- def train(self, X, y):
- """
- Train the classifier. For k-nearest neighbors this is just
- memorizing the training data.
- Input:
- X - A num_train x dimension array where each row is a training point.
- y - A vector of length num_train, where y[i] is the label for X[i, :]
- """
- self.X_train = X
- self.y_train = y
- def predict(self, X, k=1, num_loops=0):
- """
- Predict labels for test data using this classifier.
- Input:
- X - A num_test x dimension array where each row is a test point.
- k - The number of nearest neighbors that vote for predicted label
- num_loops - Determines which method to use to compute distances
- between training points and test points.
- Output:
- y - A vector of length num_test, where y[i] is the predicted label for the
- test point X[i, :].
- """
- if num_loops == 0:
- dists = self.compute_distances_no_loops(X)
- elif num_loops == 1:
- dists = self.compute_distances_one_loop(X)
- elif num_loops == 2:
- dists = self.compute_distances_two_loops(X)
- else:
- raise ValueError('Invalid value %d for num_loops' % num_loops)
- return self.predict_labels(dists, k=k)
- def compute_distances_two_loops(self, X):
- """
- Compute the distance between each test point in X and each training point
- in self.X_train using a nested loop over both the training data and the
- test data.
- Input:
- X - An num_test x dimension array where each row is a test point.
- Output:
- dists - A num_test x num_train array where dists[i, j] is the distance
- between the ith test point and the jth training point.
- """
- num_test = X.shape[0]
- num_train = self.X_train.shape[0]
- dists = np.zeros((num_test, num_train))
- for i in xrange(num_test):
- for j in xrange(num_train):
- #####################################################################
- # TODO: #
- # Compute the l2 distance between the ith test point and the jth #
- # training point, and store the result in dists[i, j] #
- #####################################################################
- dists[i,j] = np.sqrt(np.sum(np.square(X[i]-self.X_train[j]))) #usere loesung
- # dists[i,j] = np.sqrt(np.sum((X[i] - self.X_train[j]) ** 2)) #erste loesung
- # dists[i,j] = np.sqrt(((X[i,:]-self.X_train[j,:])**2).sum(axis=0))
- # dists[i,j] = np.sqrt(np.sum((X[i,:]-self.X_train[j,:])**2,axis=0))
- # dists[i,j] = np.sqrt(np.sum(np.square(X[i,:]-self.X_train[j,:]),axis=1))
- #####################################################################
- # END OF YOUR CODE #
- #####################################################################
- return dists
- def compute_distances_one_loop(self, X):
- """
- Compute the distance between each test point in X and each training point
- in self.X_train using a single loop over the test data.
- Input / Output: Same as compute_distances_two_loops
- """
- num_test = X.shape[0]
- num_train = self.X_train.shape[0]
- dists = np.zeros((num_test, num_train))
- for i in xrange(num_test):
- #######################################################################
- # TODO: #
- # Compute the l2 distance between the ith test point and all training #
- # points, and store the result in dists[i, :]. #
- #######################################################################
- #dists[i,:] = np.linalg.norm(X[i,:]-self.X_train,axis = 1)
- dists[i,:]= np.sqrt(np.sum(np.square(X[i,:]-self.X_train),axis=1)) #unsere loesung
- #######
- # X[i].shape = (3072,) -> broadcast to fit X_train shape. erste loesung
- #dists[i] = np.sqrt(np.sum((X[i] - self.X_train) ** 2, axis=1))
- #######################################################################
- # END OF YOUR CODE #
- #######################################################################
- return dists
- def compute_distances_no_loops(self, X):
- """
- Compute the distance between each test point in X and each training point
- in self.X_train using no explicit loops.
- Input / Output: Same as compute_distances_two_loops
- """
- num_test = X.shape[0]
- num_train = self.X_train.shape[0]
- dists = np.zeros((num_test, num_train))
- #########################################################################
- # TODO: #
- # Compute the l2 distance between all test points and all training #
- # points without using any explicit loops, and store the result in #
- # dists. #
- # HINT: Try to formulate the l2 distance using matrix multiplication #
- # and two broadcast sums. #
- #########################################################################
- #X1 = np.zeros((X.shape[0],1,X.shape[1]))
- #X1[:,0,:] = X
- #X_train1 = np.zeros((1,self.X_train.shape[0],self.X_train.shape[1]))
- #X_train1[0,:,:] = self.X_train
- #dists = np.linalg.norm(X_train1-X1,axis = 2)
- dists = np.sqrt((X**2).sum(axis=1)[:, np.newaxis] + (self.X_train**2).sum(axis=1) - 2 * X.dot(self.X_train.T))
- #####
- ## from scipy.spatial.distance import cdist
- ## dists = cdist(X, self.X_train, metric='euclidean')
- ## to fully vectorize, use of the formula: (a-b)^2 = a^2 + b^2 -2ab
- ## (a-b)^2 = quadra -2 * prod
- ## with quadra = a^2 + b^2; and prod = ab
- #a2 = np.sum(X ** 2, axis=1) # shape: (500,)
- #b2 = np.sum(self.X_train ** 2, axis=1) # shape: (5000,)
- #print a2.shape
- #aa2 = a2.reshape(a2.shape[0], 1) # reshape a2 to (500,1) to be able to broadcast a2 and sum to b2
- #print aa2.shape
- #quadra = aa2 + b2 # shape = (500, 5000)
- #prod = np.dot(X, self.X_train.T) # shape = (500, 5000)
- #dists = np.sqrt(aa2 + b2 -2 * np.dot(X, self.X_train.T)) # shape = (500, 5000)
- #########################################################################
- # END OF YOUR CODE #
- #########################################################################
- return dists
- def predict_labels(self, dists, k=1):
- """
- Given a matrix of distances between test points and training points,
- predict a label for each test point.
- Input:
- dists - A num_test x num_train array where dists[i, j] gives the distance
- between the ith test point and the jth training point.
- Output:
- y - A vector of length num_test where y[i] is the predicted label for the
- ith test point.
- """
- num_test = dists.shape[0]
- y_pred = np.zeros(num_test)
- for i in xrange(num_test):
- # A list of length k storing the labels of the k nearest neighbors to
- # the ith test point.
- closest_y = []
- #########################################################################
- # TODO: #
- # Use the distance matrix to find the k nearest neighbors of the ith #
- # training point, and use self.y_train to find the labels of these #
- # neighbors. Store these labels in closest_y. #
- # Hint: Look up the function numpy.argsort. #
- #########################################################################
- closest_y = np.array(self.y_train[np.argsort(dists[i,:],axis=0)[:k]]) #unsere loesung
- #closest_y = np.array(self.y_train[np.argsort(dists[i,:],axis=0)[:k]],dtype=float) #unsere loesung
- ####
- #ind = np.argsort(dists[i, :], axis=0) #erste loesung
- #closest_y = self.y_train[ind[:k]]
- #########################################################################
- # TODO: #
- # Now that you have found the labels of the k nearest neighbors, you #
- # need to find the most common label in the list closest_y of labels. #
- # Store this label in y_pred[i]. Break ties by choosing the smaller #
- # label. #
- #########################################################################
- #d = {x:closest_y.count(x) for x in closest_y}
- #c, b = d.keys(), d.values()
- #e=c[b.index(max(b))]
- #y_pred[i] = self.y_train[closest_y]
- y_pred[i] = np.bincount(closest_y).argmax() # meine loesung worked
- ########### worked
- #from scipy.stats import mode # zweite loesung meine loesong
- #y_pred[i] = mode(closest_y)[0][0]
- ############### zweite loesung worked but the accuracy is a disaster
- #from collections import Counter
- ## pick nearest neighbors
- #closest_y = self.y_train[np.argsort(dists[i]) < k]
- ## count which class appears most
- #y_pred[i] = Counter(closest_y).most_common(1)[0][0]
- ############## erste loesung worked
- #types = np.unique(closest_y)
- ## print 'types', types
- #closest_y_list = closest_y.tolist()
- #occ = [closest_y_list.count(lab) for lab in types]
- ## print 'occ:', occ
- #y_pred[i] = types[np.argmax(occ)]
- ## print 'pred: ', y_pred[i]
- #########################################################################
- # END OF YOUR CODE #
- #########################################################################
- return y_pred
- # with the CIFAR dataset
- # Two loop version took 80.492011 seconds
- # One loop version took 112.854208 seconds
- # No loop version took 13.636812 seconds
- # accuracy with k=10:
- # Got 141 / 500 correct => accuracy: 0.282000
- # Final accuracy, after 5-fold cross-validation to choose best_k=50:
- # Got 293 / 500 correct => accuracy: 0.586000
Add Comment
Please, Sign In to add comment