Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import time
class Node:
    """A k-d tree node: one data row plus links to its two subtrees."""

    def __init__(self, point):
        # Children start out empty; the tree builder attaches them later.
        self.left = self.right = None
        self.point = point
class Search:
    """Mutable state of a k-nearest-neighbour query.

    Holds the current candidate rows (``best_point``) and their distances
    to the query (``best_distance``).
    """

    def __init__(self, best_point, best_distance):
        self.best_point, self.best_distance = best_point, best_distance
def data_mit_index(data):
    """Return a copy of *data* whose first column is replaced by row numbers.

    The input array is left untouched; column 0 of the copy holds
    0, 1, ..., n-1 where n is the number of rows.
    """
    indexed = np.copy(data)
    row_count = data.shape[0]
    indexed[:, 0] = np.arange(row_count)
    return indexed
def median(list):
    """Return a median that is guaranteed to be an element of the input.

    For an even number of values the first one is discarded before taking
    the median, so the result is always one of the remaining values rather
    than the average of two.
    """
    has_odd_length = len(list) % 2 != 0
    values = list if has_odd_length else np.delete(list, 0, 0)
    return np.median(values)
def kD_tree(data, level):
    """Recursively build a k-d tree from the rows of *data*.

    Column 0 of *data* is carried along as payload (the callers store a row
    index there); columns 1.. are the coordinates. The splitting coordinate
    cycles with the depth: ``level % number_of_coordinates``.

    Args:
        data: 2-D array, one point per row.
        level: depth of the node being built (0 for the root).

    Returns:
        The root ``Node`` of the (sub)tree, or ``None`` when *data* has no rows.
    """
    num_points = data.shape[0]
    if num_points == 0:
        return None
    if num_points == 1:
        return Node(data[0, :])

    dim = data.shape[1] - 1
    r = level % dim

    # The pivot is the median of (at most) the first 101 values on the split
    # axis; `median` always returns one of those values, so `index` finds it.
    auswahl = data[0:101, r + 1]
    point_index = auswahl.tolist().index(median(auswahl))
    point = auswahl[point_index]

    # Remove exactly the pivot row before partitioning. Dropping row 0
    # instead would lose that point and store the pivot a second time in the
    # left subtree whenever the pivot is not the first row.
    rest = np.delete(data, point_index, 0)
    goes_left = rest[:, r + 1] <= point
    goes_right = rest[:, r + 1] > point

    v = Node(data[point_index, :])
    v.left = kD_tree(rest[goes_left], level + 1)
    v.right = kD_tree(rest[goes_right], level + 1)
    return v
def sucher_setup(x, k, data):
    """Create the initial search state from the first *k* rows of *data*.

    The Euclidean distance from *x* to each of those rows (coordinates are
    columns 1..) is computed, and the rows are stored from farthest
    (position 0) to nearest (position k-1).

    Returns:
        A ``Search`` holding the ordered rows and their distances.
    """
    seeds = data[0:k, :]
    offsets = np.ones((k, 1)) * x - seeds[:, 1:]
    distances = np.sqrt(np.sum(offsets * offsets, axis=1))

    # argsort is ascending; reversing it puts the farthest seed first.
    farthest_first = np.argsort(distances)[::-1]

    ordered_distances = np.zeros(k)
    ordered_rows = np.zeros((k, data.shape[1]))
    ordered_distances[:] = distances[farthest_first]
    ordered_rows[:, :] = seeds[farthest_first, :]
    return Search(ordered_rows, ordered_distances)
def kNN_search(x, k, node, level, sucher):
    """Search the k-d tree below *node* and update the k best candidates.

    ``sucher.best_point`` / ``sucher.best_distance`` are modified in place.
    They are kept ordered from farthest (position 0) to nearest
    (position k-1), so ``best_distance[0]`` is the current search radius.

    Args:
        x: query coordinates (1-D array).
        k: number of neighbours being tracked.
        node: tree node to visit; ``node.point[0]`` is the row identifier,
            ``node.point[1:]`` are the coordinates.
        level: depth of *node*, which selects the splitting coordinate.
        sucher: search state with ``best_point`` (k rows) and
            ``best_distance`` (k values, descending).

    Returns:
        None.
    """
    r = level % x.shape[0]
    best_point = sucher.best_point
    best_distance = sucher.best_distance

    # Skip rows that are already candidates. Testing membership (instead of
    # assuming the seed rows carry identifiers 0..k-1) prevents the same row
    # from being counted twice when the seeds have other identifiers.
    if not np.any(best_point[:k, 0] == node.point[0]):
        distance = np.linalg.norm(x - node.point[1:])
        # Distances are sorted in descending order, so the number of strictly
        # farther candidates is the slot just past the insertion position.
        j = int(np.count_nonzero(best_distance[:k] > distance))
        if j > 0:
            # Drop the farthest candidate, shift the farther ones down by one
            # and insert the new row at position j - 1.
            best_point[0:j - 1, :] = best_point[1:j, :].copy()
            best_point[j - 1, :] = node.point
            best_distance[0:j - 1] = best_distance[1:j].copy()
            best_distance[j - 1] = distance

    split = node.point[r + 1]
    # Visit the side containing x first; the other side is visited only if
    # the current search radius still reaches across the splitting plane.
    if x[r] <= split:
        if node.left is not None and x[r] - best_distance[0] <= split:
            kNN_search(x, k, node.left, level + 1, sucher)
        if node.right is not None and x[r] + best_distance[0] > split:
            kNN_search(x, k, node.right, level + 1, sucher)
    else:
        if node.right is not None and x[r] + best_distance[0] > split:
            kNN_search(x, k, node.right, level + 1, sucher)
        if node.left is not None and x[r] - best_distance[0] <= split:
            kNN_search(x, k, node.left, level + 1, sucher)
def kNN(x, k, data, baum):
    """Return the identifiers (column 0) of the k nearest rows to *x*.

    The search state is seeded from *data* via ``sucher_setup`` and refined
    by walking the tree *baum* from its root with ``kNN_search``. The result
    has shape ``(k,)``, ordered from the farthest to the nearest neighbour.
    """
    state = sucher_setup(x, k, data)
    kNN_search(x, k, baum, 0, state)
    neighbour_ids = state.best_point[:, 0]
    return neighbour_ids.reshape(k)
def _majority_label(votes):
    """Return the sign of the summed votes; a tie (sum 0) counts as +1."""
    total = np.sum(votes)
    return 1.0 if total == 0 else np.sign(total)


def classify(name, KSET, l):
    """Select k by l-fold cross-validation, then classify the test set.

    Reads ``<name>.train.csv`` and ``<name>.test.csv`` (comma separated;
    column 0 is used as the label, the remaining columns as coordinates) and
    writes the test rows with their predicted label in column 0 to
    ``<name>.result.csv``. The error formula ``|label - prediction| / 2``
    presumably expects labels of -1/+1 (not verified here).

    Args:
        name: common file-name prefix of the train/test/result files.
        KSET: collection of candidate values for k.
        l: number of cross-validation folds.

    Returns:
        The test error averaged over the l per-fold classifiers.
    """
    data = np.genfromtxt(name + ".train.csv", delimiter=',')
    data_size = data.shape[0]
    elements = data_size // l
    np.random.shuffle(data)
    # Column 0 of index_data holds row numbers into the shuffled `data`,
    # so neighbour identifiers can be mapped back to labels.
    index_data = data_mit_index(data)
    labels = data[:, 0]

    # Search once with the largest k; the neighbours for every smaller k are
    # the tail of that (farthest-to-nearest) result.
    k_max = max(KSET)
    smaller_ks = list(KSET)
    smaller_ks.remove(k_max)
    k_values = [k_max] + smaller_ks

    R = np.zeros((len(k_values), l))
    tree = []
    D_rest = []
    for i in range(l):
        start = i * elements
        # The last fold takes every remaining row, including the final one.
        stop = (i + 1) * elements if i < l - 1 else data_size
        D = index_data[start:stop, :]
        D_rest.append(np.delete(index_data, slice(start, stop), 0))
        tree.append(kD_tree(D_rest[i], 0))

        classification = np.zeros((len(k_values), D.shape[0]))
        for j in range(D.shape[0]):
            neighbours = kNN(D[j, 1:], k_max, D_rest[i], tree[i]).astype(int)
            for b, k in enumerate(k_values):
                classification[b, j] = _majority_label(labels[neighbours[k_max - k:]])

        truth = labels[D[:, 0].astype(int)]
        R[:, i] = np.mean(np.abs(truth - classification) / 2, axis=1)

    # Pick the k with the smallest mean validation error (first one on ties).
    k_best = int(k_values[int(np.argmin(np.mean(R, axis=1)))])

    data_test = np.genfromtxt(name + ".test.csv", delimiter=',')
    elements_test = data_test.shape[0]
    classification_test = np.zeros((l, elements_test))
    R_test = np.zeros(l)
    for i in range(l):
        for j in range(elements_test):
            neighbours = kNN(data_test[j, 1:], k_best, D_rest[i], tree[i]).astype(int)
            classification_test[i, j] = _majority_label(labels[neighbours])
        R_test[i] = np.mean(np.abs(data_test[:, 0] - classification_test[i, :]) / 2)
    f = np.mean(R_test)

    # Final prediction: majority vote over the l fold classifiers, using the
    # same tie rule (+1) as the individual votes so no label 0 is written.
    ensemble = np.sign(np.sum(classification_test, axis=0))
    ensemble[ensemble == 0] = 1
    result = np.copy(data_test)
    result[:, 0] = ensemble
    np.savetxt(name + ".result.csv", result, delimiter=',')
    return f
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement