Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- %pylab inline
class Cluster:
    """One node of a linked list of clusters for agglomerative clustering.

    Top-level clusters are chained through ``next`` / ``prev``.  Objects that
    have been merged into a cluster hang off it through ``next_inside`` /
    ``prev_inside``, so a cluster's members are the node itself followed by
    its ``next_inside`` chain.

    ``data`` is read as ``data[0][attr]``, i.e. a one-row table of attribute
    values.  ``number`` identifies the cluster and is used directly as its
    row/column index in the distance matrix handled by ``clusterize``.
    """

    # Placeholder distance stored on the matrix diagonal and in the row and
    # column of a cluster that has been merged into another one.
    REMOVED = 999999

    def __init__(self, data):
        self.data = data
        self.number = 0
        self.next = None
        self.prev = None
        self.next_inside = None
        self.prev_inside = None

    def _top_level(self):
        """Yield this node and every node reachable through ``next``."""
        node = self
        while node is not None:
            yield node
            node = node.next

    def _members(self):
        """Yield this node and every node reachable through ``next_inside``."""
        node = self
        while node is not None:
            yield node
            node = node.next_inside

    def _find(self, num):
        """Return the first top-level cluster numbered ``num``, or None."""
        for node in self._top_level():
            if node.number == num:
                return node
        return None

    @staticmethod
    def _resolve_weight(weight, attributes):
        """Return the first ``attributes`` weights as a float array.

        When ``weight`` is None the module-level name ``weight`` is used if
        it exists (the original code always read that global); otherwise
        every attribute gets weight 1.
        """
        if weight is None:
            weight = globals().get("weight")
        if weight is None:
            return np.ones(attributes)
        return np.asarray(weight, dtype=float)[:attributes]

    @staticmethod
    def _weighted_distance(row_a, row_b, weight):
        """Return sqrt(sum(weight * (row_a - row_b) ** 2))."""
        delta = np.asarray(row_a, dtype=float) - np.asarray(row_b, dtype=float)
        return float(np.sqrt(np.sum(weight * delta ** 2)))

    # print all clusters
    def print_all(self):
        """Print a header and the data of every member, cluster by cluster."""
        for cluster in self._top_level():
            print("########CLUSTER NUMBER", cluster.number, "########")
            for member in cluster._members():
                print(member.data)

    # add cluster
    def add_new(self, data, num):
        """Append a new top-level cluster holding ``data`` and numbered ``num``."""
        tail = self
        while tail.next is not None:
            tail = tail.next
        node = Cluster(data)
        node.number = num
        node.prev = tail
        tail.next = node

    def merge_clusters(self, num1, num2):
        """Merge top-level cluster ``num2`` into top-level cluster ``num1``.

        Cluster ``num2`` is unlinked from the top-level list and appended,
        together with its own members, to the end of the member chain of
        cluster ``num1``.  The surviving top-level cluster is numbered
        ``num1``.

        Raises:
            ValueError: if ``num1 == num2`` or either number is not found
                among the top-level clusters reachable from this node.
        """
        if num1 == num2:
            raise ValueError("cannot merge cluster %r with itself" % (num1,))
        target = self._find(num1)
        source = self._find(num2)
        if target is None or source is None:
            raise ValueError(
                "clusters %r and %r must both be in the list" % (num1, num2)
            )
        if source.prev is None:
            # A node without a predecessor cannot be unlinked from the
            # top-level list, so absorb in the other direction and let the
            # surviving node carry the number the caller asked to keep.
            target, source = source, target
            target.number = num1

        tail = target
        while tail.next_inside is not None:
            tail = tail.next_inside

        # Unlink ``source`` from the top-level list in both directions.
        source.prev.next = source.next
        if source.next is not None:
            source.next.prev = source.prev
        source.prev = None
        source.next = None

        # ``source`` keeps its own next_inside chain, so its members follow.
        tail.next_inside = source
        source.prev_inside = tail

    # distance from one element to the elements of another cluster
    def pair_distance_inside(self, cluster1, cluster2, NUM_ATTRS, weight=None):
        """Return the smallest weighted distance from ``cluster1`` to ``cluster2``.

        Only ``cluster1.data[0]`` itself is compared (not its members); it is
        compared with ``cluster2`` and every node on ``cluster2``'s
        ``next_inside`` chain, using the first ``NUM_ATTRS`` attributes.

        Args:
            cluster1: node whose own data row is the reference point.
            cluster2: first node of the member chain to compare against.
            NUM_ATTRS: number of leading attributes to use.
            weight: per-attribute weights; see ``_resolve_weight`` for the
                fallback used when omitted.

        Returns:
            The minimum distance as a float, or ``inf`` when ``cluster2`` is
            None.
        """
        weights = Cluster._resolve_weight(weight, NUM_ATTRS)
        reference = cluster1.data[0][:NUM_ATTRS]
        nearest = float("inf")
        member = cluster2
        while member is not None:
            candidate = Cluster._weighted_distance(
                reference, member.data[0][:NUM_ATTRS], weights
            )
            if candidate < nearest:
                nearest = candidate
            member = member.next_inside
        return nearest

    # find nearest clusters
    @staticmethod
    def find_nearest(distance_matrix, shape, max_num, lowest=10000):
        """Return ``[row, col, value]`` of the smallest off-diagonal entry.

        Only rows below ``min(shape[0], max_num)`` and columns below
        ``min(shape[1], max_num)`` are searched, and only entries strictly
        smaller than ``lowest`` qualify.  On ties the first entry in
        row-major order wins.  When nothing qualifies the result is
        ``[0, 0, lowest]``.
        """
        rows = min(shape[0], max_num)
        cols = min(shape[1], max_num)
        num1 = 0
        num2 = 0
        for row in range(rows):
            for col in range(cols):
                if row != col and distance_matrix[row][col] < lowest:
                    lowest = distance_matrix[row][col]
                    num1 = row
                    num2 = col
        return [num1, num2, lowest]

    # creating a distance matrix by the nearest neighbor heuristic
    def clusterize(self, max_num, NUM_ATTRS, weight, matrix):
        """Perform one agglomeration step and return the updated matrix.

        The step (1) fills ``matrix`` with the nearest-member distance
        between every pair of top-level clusters, (2) picks the closest pair,
        (3) merges the second cluster of the pair into the first and
        (4) rewrites the first cluster's row and column with the
        Lance-Williams update while marking the second cluster's row and
        column with ``REMOVED``.

        Args:
            max_num: number of leading matrix rows/columns to consider.
            NUM_ATTRS: number of leading attributes used for distances.
            weight: per-attribute weights.
            matrix: square array indexed by cluster number; it needs a
                ``shape`` attribute and is modified in place.

        Returns:
            ``matrix``.  It is returned unchanged when fewer than two
            top-level clusters exist; when no pair is closer than
            ``REMOVED`` only the pairwise distances are refreshed.
        """
        distance_matrix = matrix
        removed = Cluster.REMOVED
        clusters = list(self._top_level())
        if len(clusters) < 2:
            return distance_matrix

        for position, cluster in enumerate(clusters):
            current_num = cluster.number
            distance_matrix[current_num][current_num] = removed
            for cluster_pair in clusters[position + 1:]:
                next_num = cluster_pair.number
                # the distance from the nearest cluster element is the distance
                distance = min(
                    self.pair_distance_inside(member, cluster_pair, NUM_ATTRS, weight)
                    for member in cluster._members()
                )
                distance_matrix[current_num][next_num] = distance
                distance_matrix[next_num][current_num] = distance

        # Use the placeholder as the search cap so that every real distance
        # below it is eligible while removed entries are never picked.
        to_merge = Cluster.find_nearest(
            distance_matrix, distance_matrix.shape, max_num, lowest=removed
        )
        cluster1_num = to_merge[0]
        cluster2_num = to_merge[1]
        if cluster1_num == cluster2_num:
            return distance_matrix

        # Merge first: if either number is unknown this raises before the
        # matrix rows of the pair are touched.
        self.merge_clusters(cluster1_num, cluster2_num)

        limit = min(max_num, distance_matrix.shape[0], distance_matrix.shape[1])
        # Snapshot both columns before the second cluster is wiped.
        cluster1_distances = [distance_matrix[i][cluster1_num] for i in range(limit)]
        cluster2_distances = [distance_matrix[i][cluster2_num] for i in range(limit)]

        # removing cluster2 from distance_matrix
        for i in range(limit):
            distance_matrix[i][cluster2_num] = removed
            distance_matrix[cluster2_num][i] = removed

        # Lance-Williams formula coefficients; 1/2, 1/2, -1/2 reduce the
        # update to min(d1, d2), i.e. the nearest-neighbour distance.
        Au = 0.5
        Av = 0.5
        Y = -0.5
        for i in range(limit):
            if i == cluster1_num or i == cluster2_num:
                continue
            d1 = cluster1_distances[i]
            d2 = cluster2_distances[i]
            RWS = Au * d2 + Av * d1 + Y * abs(d2 - d1)
            distance_matrix[i][cluster1_num] = RWS
            distance_matrix[cluster1_num][i] = RWS
        return distance_matrix
# main
# Load the comma-separated data set: one object per row, one attribute per
# column.  ndmin=2 keeps the array two-dimensional even when the file has a
# single row or a single column, so shape[1] below is always available.
sample = np.loadtxt("/Users/a1234/Desktop/Datasets/data.txt", delimiter=',', ndmin=2)
NUM_ROWS = sample.shape[0]  # number of rows
NUM_COLS = sample.shape[1]  # number of columns
NUM_CLUSTERS = NUM_ROWS  # number of clusters

# simple clusters by 1 object: row i becomes cluster number i, so cluster
# numbers are exactly the valid indices 0 .. NUM_CLUSTERS-1 of distance_matrix
array = sample[0:1]
Clusters = Cluster(array)
Clusters.number = 0
for count in range(1, NUM_ROWS):
    data = sample[count:count + 1]
    Clusters.add_new(data, count)

# counting weight for each attribute: the reciprocal of the largest value in
# the column over ALL rows (never below 0, as the running maximum starts at
# 0).  Columns whose maximum is not positive keep weight 1 instead of
# dividing by zero.
column_max = np.max(sample, axis=0, initial=0.0)
weight = np.ones(NUM_COLS)
positive = column_max > 0
weight[positive] = 1 / column_max[positive]
for attr in range(NUM_COLS):
    print(weight[attr])

distance_matrix = np.zeros((NUM_CLUSTERS, NUM_CLUSTERS))
Clusters.clusterize(NUM_CLUSTERS, NUM_COLS, weight, distance_matrix)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement