# Untitled

from time import perf_counter
from sklearn.neighbors import KDTree
# from sklearn.preprocessing import normalize
# from scipy import spatial

def true_closest(X_train, X_test, k):
    """Brute-force k-nearest-neighbour search used as a reference answer.

    For every query point in ``X_test`` the distance (``np.linalg.norm`` of
    the difference) to every point of ``X_train`` is computed, and the
    positions of the training points are ranked from nearest to farthest.

    Args:
        X_train: Iterable of training points; positions in this iterable are
            the indices that get reported.
        X_test: Iterable of query points.
        k: Maximum number of neighbours to report per query point.

    Returns:
        A list with one entry per query point; each entry is a list of at
        most ``k`` training indices ordered by increasing distance. Points at
        equal distance keep their original relative order (stable sort).
    """
    neighbours = []
    for query in X_test:
        # Distance from this query to each training point, in training order.
        distances = [np.linalg.norm(point - query) for point in X_train]
        # Stable sort of the indices by distance: ties resolve to lower index.
        order = sorted(range(len(distances)), key=distances.__getitem__)
        limit = min(k, len(order))
        neighbours.append(order[:limit])
    return neighbours

# Benchmark / sanity check: compare sklearn's KDTree k-NN query against the
# brute-force reference `true_closest`, both in wall-clock time and in the
# neighbours they return.
#
# Alternative inputs are kept commented out for quick experiments.
# X, y = read_cancer_dataset('cancer.csv')
X, y = read_spam_dataset('spam.csv')
# X = normalize(X, axis=0, norm='l2')
# NOTE(review): read_spam_dataset / train_test_split are defined outside this
# chunk; the meaning of the 0.9 argument should be confirmed against them.
X_train, y_train, X_test, y_test = train_test_split(X, y, 0.9)
# X_train = np.random.randn(100, 3)
# X_test = np.random.randn(10, 3)

tree = KDTree(X_train, leaf_size=40)

# Time the KD-tree query; only the neighbour indices are needed afterwards.
time1 = perf_counter()
_, predicted = tree.query(X_test, k=30)
time1 = perf_counter() - time1

# Time the brute-force reference on the same data with the same k.
time2 = perf_counter()
true = true_closest(X_train, X_test, k=30)
time2 = perf_counter() - time2
print(time1, time2)

# Compare the shape tuples directly. Subtracting them as arrays would
# broadcast, so a rank mismatch such as (5, 5) vs (5,) could pass unnoticed.
if np.asarray(predicted).shape != np.asarray(true).shape:
    print("Wrong shape")
else:
    errors = 0
    for x0, row1, row2 in zip(X_test, predicted, true):
        for i1, i2 in zip(row1, row2):
            if i1 == i2:
                continue
            # Different indices are only a real disagreement when the two
            # points are at different distances from the query: equidistant
            # points (e.g. duplicate rows) may legitimately be returned in a
            # different order by the tree and by the stable brute-force sort.
            d1 = np.linalg.norm(X_train[i1] - x0)
            d2 = np.linalg.norm(X_train[i2] - x0)
            if not np.isclose(d1, d2):
                errors += 1
    if errors > 0:
        print("Encounted", errors, "errors")