# k-NN

# Import the packages used later in the script.
import numpy as np
from math import sqrt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Load the training and the test set from their comma-separated files.
dataTrain = np.loadtxt("IDSWeedCropTrain.csv", delimiter=",")
dataTest = np.loadtxt("IDSWeedCropTest.csv", delimiter=",")

# Separate the input variables (every column but the last) from the labels
# (the last column), for both sets.
XTrain, YTrain = dataTrain[:, :-1], dataTrain[:, -1]
XTest, YTest = dataTest[:, :-1], dataTest[:, -1]


##### Exercise 1 (Nearest neighbor classification) #####

print("Exercise 1:\n")

### My own implementation of the nearest neighbor classifier (1-NN).

# Function to calculate the euclidean distance between two rows.
def euclDistance(test_row, train_row):
    """
    Return the Euclidean distance between two rows.

    The rows are compared position by position: the squared differences of
    the paired entries are summed and the square root of that sum is
    returned. Only the first len(test_row) entries of train_row are read.
    """

    squared_sum = sum(
        (value - train_row[idx]) ** 2 for idx, value in enumerate(test_row)
    )

    return sqrt(squared_sum)


# Function to determine the k-nearest neighbor.
def kNearestNeighbor(train_dataset, train_results, test_row, k=1):
    """
    Return the k training rows closest to test_row, with their labels.

    The Euclidean distance from test_row to every row of train_dataset is
    computed with euclDistance. Rows are ranked by distance only; rows that
    lie at exactly the same distance keep their original training order.

    Parameters:
        train_dataset: sequence of training rows (e.g. a 2-D numpy array).
        train_results: the labels, one per row of train_dataset.
        test_row: the row whose nearest neighbors are wanted.
        k: number of neighbors to return (default 1); must be at least 1.

    Returns:
        (neighbors, results): two numpy arrays with the selected rows and
        their labels, ordered from nearest to farthest. Fewer than k entries
        are returned when the training set holds fewer than k rows (both
        arrays are empty for an empty training set).

    Raises:
        ValueError: if k is smaller than 1.
    """

    # A negative k would silently drop rows from the END of the ranking
    # through slice semantics, so reject it explicitly.
    if k < 1:
        raise ValueError("k must be at least 1")

    distances = np.array(
        [euclDistance(test_row=test_row, train_row=row) for row in train_dataset],
        dtype=float,
    )

    # Rank on the distances alone. Sorting (distance, row, label) tuples would
    # break distance ties by comparing feature values and then labels, which
    # always ranks the smaller label first for duplicate points. A stable sort
    # keeps tied rows in training order instead.
    nearest = np.argsort(distances, kind="stable")[:k]

    neighbors = np.asarray(train_dataset)[nearest]
    results = np.asarray(train_results)[nearest]

    return neighbors, results


# Function to make predictions based on the calculated k nearest neighbors.
def prediction(train_dataset, train_results, test_row, k=1):
    """
    Predict the label of test_row by a majority vote among its k nearest
    neighbors in the training data.

    The neighbors are obtained from kNearestNeighbor, the occurrences of
    each distinct label among them are counted, and the label with the
    highest count is returned. If several labels share the highest count,
    the smallest of those labels is returned.
    """

    _, neighbor_labels = kNearestNeighbor(
        train_dataset=train_dataset,
        train_results=train_results,
        test_row=test_row,
        k=k,
    )
    labels, votes = np.unique(neighbor_labels, return_counts=True)

    # np.unique returns the labels in ascending order and argmax reports the
    # first maximum, which gives the tie rule described above.
    return labels[np.argmax(votes)]


# Function for prediction on an entire dataset.
def predictionOnDataset(train_dataset, train_results, test_dataset, k=1):
    """
    Predict a label for every row of a test dataset.

    Takes a training dataset, the results (labels) of the training dataset
    and a test dataset. The prediction function is called once per row of
    the test dataset with the given k, and the predictions are returned as
    a numpy array in the order of the test rows.
    """

    return np.array(
        [
            prediction(
                train_dataset=train_dataset,
                train_results=train_results,
                test_row=test_row,
                k=k,
            )
            for test_row in test_dataset
        ]
    )

# Classify the test set with the hand-written classifier; k is left at its
# default of 1, i.e. plain nearest-neighbor classification.
results_test = predictionOnDataset(XTrain, YTrain, XTest)

### Implementation using scikit-learn for classifier.
knn_sk = KNeighborsClassifier(n_neighbors=1, metric="euclidean")
knn_sk.fit(XTrain, YTrain)

# Determine the classification accuracy of both models on the test set, so
# the two implementations can be compared directly.
accTest_sk = accuracy_score(YTest, knn_sk.predict(XTest))
print("Scikit-learn KNN =", accTest_sk)

accTest_own = accuracy_score(YTest, results_test)
print("Own implementation KNN =", accTest_own)

print()
print()


##### Exercise 2 (Cross-validation) #####

print("Exercise 2:\n")

# Create indices for CV.
cv = KFold(n_splits=5)

# Candidate numbers of neighbors.
k_values = [1, 3, 5, 7, 9, 11]

# For every candidate k, the validation accuracy obtained on each fold.
fold_accuracies = {k: [] for k in k_values}

# Loop over CV folds. Every candidate k is evaluated INSIDE the fold loop, so
# each fold serves once as validation set; evaluating after the loop would
# only ever use the split left over from the last fold.
for train, test in cv.split(XTrain):
    XTrainCV, XTestCV, YTrainCV, YTestCV = XTrain[train], XTrain[test], YTrain[train], YTrain[test]

    for k in k_values:
        results = predictionOnDataset(XTrainCV, YTrainCV, XTestCV, k=k)
        accTest = accuracy_score(YTestCV, results)
        fold_accuracies[k].append(accTest)

# Store the mean cross-validation accuracy of every k in a dictionary.
k_dict = {k: float(np.mean(accs)) for k, accs in fold_accuracies.items()}

print("Dictionary:")
print(k_dict)
print()

# Pick the k with the highest mean accuracy instead of hard-coding it. max()
# returns the first maximum and the dictionary follows the ascending order of
# k_values, so ties are resolved in favour of the smallest k.
k_best = max(k_dict, key=k_dict.get)
print("The best mean cross-validation accuracy is obtained for k_best =", k_best)

print()
print()


##### Exercise 3 (Evaluation of classification performance) #####

print("Exercise 3:\n")

# Train on the full training set with k = k_best, classify the test set with
# predictionOnDataset, then use scikit-learn to measure the test accuracy.
results_k_best = predictionOnDataset(XTrain, YTrain, XTest, k=k_best)
accTest_k_best = accuracy_score(YTest, results_k_best)

# The label is built from k_best so it always matches the k actually used.
print(str(k_best) + "-NN (k_best-NN) test accuracy =", accTest_k_best)

print()
print()


##### Exercise 4 (Data normalization) #####

print("Exercise 4:\n")

# Center data. The test set is shifted by the TRAINING mean: the
# transformation is part of the model, so it has to be estimated on the
# training data only and then applied unchanged to the test data.
mean_vector_XTrain = np.mean(XTrain, axis=0)
XTrain_Zn = XTrain - mean_vector_XTrain
XTest_Zn = XTest - mean_vector_XTrain

# Normalize data, again with the standard deviation of the training set for
# both sets, so training and test rows live on the same feature scale.
std_vector_XTrain_Zn = np.std(XTrain_Zn, axis=0)
XTrain_Zni = XTrain_Zn / std_vector_XTrain_Zn
XTest_Zni = XTest_Zn / std_vector_XTrain_Zn


### Perform the steps (cross-validation) from Exercise 2

# Create indices for CV.
cv = KFold(n_splits=5)

# Candidate numbers of neighbors.
k_values_norm = [1, 3, 5, 7, 9, 11]

# For every candidate k, the validation accuracy obtained on each fold.
fold_accuracies_norm = {k: [] for k in k_values_norm}

# Loop over CV folds. Every candidate k is evaluated INSIDE the fold loop, so
# each fold serves once as validation set; evaluating after the loop would
# only ever use the split left over from the last fold.
# NOTE(review): the folds are cut from XTrain_Zni, i.e. data standardized
# with the statistics of the full training set, as in the original script.
for train, test in cv.split(XTrain_Zni):
    XTrain_Zni_CV, XTest_Zni_CV, YTrainCV, YTestCV = XTrain_Zni[train], XTrain_Zni[test], YTrain[train], YTrain[test]

    for k in k_values_norm:
        results = predictionOnDataset(XTrain_Zni_CV, YTrainCV, XTest_Zni_CV, k=k)
        accTest = accuracy_score(YTestCV, results)
        fold_accuracies_norm[k].append(accTest)

# Store the mean cross-validation accuracy of every k in a dictionary.
k_dict_norm = {k: float(np.mean(accs)) for k, accs in fold_accuracies_norm.items()}

print("Dictionary:")
print(k_dict_norm)
print()

# Pick the k with the highest mean accuracy instead of hard-coding it. max()
# returns the first maximum and the dictionary follows the ascending order of
# k_values_norm, so ties are resolved in favour of the smallest k.
k_best_norm = max(k_dict_norm, key=k_dict_norm.get)
print("The best mean cross-validation accuracy is obtained for k_best_norm =", k_best_norm)

print()


### Perform the steps (Evaluation of classification performance) from Exercise 3.

# Train on the full normalized training set with k = k_best_norm, classify
# the normalized test set, then use scikit-learn to measure the test accuracy.
results_k_best_norm = predictionOnDataset(XTrain_Zni, YTrain, XTest_Zni, k=k_best_norm)
accTest_k_best_norm = accuracy_score(YTest, results_k_best_norm)

# The label is built from k_best_norm so it always matches the k actually used.
print(str(k_best_norm) + "-NN (k_best_norm-NN) test accuracy =", accTest_k_best_norm)