Advertisement
ivanknu

Untitled

Oct 19th, 2018
534
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.91 KB | None | 0 0
  1. import time
  2. from math import sqrt
  3.  
  4.  
  5. class DisjointSetUnion:
  6.     def __init__(self, size):
  7.         self.parent_ = list(range(size))
  8.         self.ranks_ = [0] * size
  9.  
  10.     def find(self, node):
  11.         if self.parent_[node] != node:
  12.             self.parent_[node] = self.find(self.parent_[node])
  13.         return self.parent_[node]
  14.  
  15.     def union_sets(self, first, second):
  16.         f_root = self.find(first)
  17.         s_root = self.find(second)
  18.         if f_root == s_root:
  19.             return False
  20.         if self.ranks_[f_root] < self.ranks_[s_root]:
  21.             self.parent_[f_root] = s_root
  22.         elif self.ranks_[f_root] > self.ranks_[s_root]:
  23.             self.parent_[s_root] = f_root
  24.         else:
  25.             self.parent_[s_root] = f_root
  26.             self.ranks_[f_root] += 1
  27.         return True
  28.  
  29.     def result(self):
  30.         return list(map(self.find, self.parent_))
  31.  
  32.  
  33. def distance_1(title1, title2, num_of_words):
  34.     # simple vec distance
  35.     id1 = 0
  36.     id2 = 0
  37.     vec = []
  38.     while id1 < len(title1) and id2 < len(title2):
  39.         x = y = 0
  40.         word1 = title1[id1][0]
  41.         word2 = title2[id2][0]
  42.         if word1 <= word2:
  43.             x = title1[id1][1]
  44.             id1 += 1
  45.         if word1 >= word2:
  46.             y = title2[id2][1]
  47.             id2 += 1
  48.         vec.append(abs(x - y))
  49.     while id1 < len(title1):
  50.         vec.append(title1[id1][1])
  51.         id1 += 1
  52.     while id2 < len(title2):
  53.         vec.append(title2[id2][1])
  54.         id2 += 1
  55.     return sqrt(sum(map(lambda x: x * x, vec)))
  56.  
  57.  
  58. def distance_2(title1, title2, num_of_words):
  59.     # manhattan vec distance
  60.     id1 = 0
  61.     id2 = 0
  62.     vec = []
  63.     while id1 < len(title1) and id2 < len(title2):
  64.         x = y = 0
  65.         word1 = title1[id1][0]
  66.         word2 = title2[id2][0]
  67.         if word1 <= word2:
  68.             x = title1[id1][1]
  69.             id1 += 1
  70.         if word1 >= word2:
  71.             y = title2[id2][1]
  72.             id2 += 1
  73.         vec.append(abs(x - y))
  74.     while id1 < len(title1):
  75.         vec.append(title1[id1][1])
  76.         id1 += 1
  77.     while id2 < len(title2):
  78.         vec.append(title2[id2][1])
  79.         id2 += 1
  80.     return sum(vec)
  81.  
  82.  
  83. def distance_3(title1, title2, num_of_words):
  84.     # Pearson
  85.     x = [0] * num_of_words
  86.     for elem in title1:
  87.         x[elem[0] - 1] += elem[1]
  88.     y = [0] * num_of_words
  89.     for elem in title2:
  90.         y[elem[0] - 1] += elem[1]
  91.     x_m = sum(x) / len(x)
  92.     y_m = sum(y) / len(y)
  93.     x = [x[_] - x_m for _ in range(len(x))]
  94.     y = [y[_] - y_m for _ in range(len(x))]
  95.     a = sum([x[_] * y[_] for _ in range(len(x))])
  96.     b = sum([x[_] * x[_] for _ in range(len(x))])
  97.     c = sum([y[_] * y[_] for _ in range(len(x))])
  98.     if a == 0:
  99.         return 1
  100.     return 1 - (a / sqrt(b * c))
  101.  
  102.  
  103. def cluster_1(ver, eds, num_of_clusters):
  104.     # maximizes minimum distance between clusters
  105.     res = DisjointSetUnion(len(ver))
  106.     cur_cluster_num = len(ver)
  107.     eds.sort()
  108.     for i in range(len(eds)):
  109.         dist, first, second = eds[i]
  110.         if res.find(first) != res.find(second):
  111.             res.union_sets(first, second)
  112.             cur_cluster_num -= 1
  113.         if cur_cluster_num == num_of_clusters:
  114.             break
  115.     return res
  116.  
  117.  
  118. def cluster_2(ver, eds, num_of_clusters):
  119.     # minimizes maximum intracluster distance
  120.     res = DisjointSetUnion(len(ver))
  121.     adj_matrix = [[-1.] * len(ver) for i in range(len(ver))]
  122.     for edge in eds:
  123.         adj_matrix[edge[1]][edge[2]] = edge[0]
  124.         adj_matrix[edge[2]][edge[1]] = edge[0]
  125.     c_vertices = {0}
  126.     for i in range(num_of_clusters):
  127.         next_centre = 0
  128.         max_dist = -1
  129.         for j in range(len(ver)):
  130.             min_dist = -1
  131.             if j not in c_vertices:
  132.                 for centre in c_vertices:
  133.                     if min_dist == -1 or min_dist > adj_matrix[j][centre]:
  134.                         min_dist = adj_matrix[j][centre]
  135.                 if max_dist < min_dist:
  136.                     next_centre = j
  137.                     max_dist = min_dist
  138.         c_vertices.add(next_centre)
  139.     for i in range(len(ver)):
  140.         if i not in c_vertices:
  141.             min_dist = -1
  142.             to_cluster = 0
  143.             for centre in c_vertices:
  144.                 if min_dist == -1 or min_dist > adj_matrix[centre][i]:
  145.                     min_dist = adj_matrix[centre][i]
  146.                     to_cluster = centre
  147.             res.union_sets(to_cluster, i)
  148.     return res
  149.  
  150.  
  151. t_start = time.time()
  152.  
  153. # input file
  154. filename = 'test_4.txt'
  155.  
  156. # function of distance
  157. distance = distance_2
  158.  
  159. # clustering type
  160. cluster = cluster_2
  161.  
  162. # clusters num
  163. clusters_num = 5
  164.  
  165. # word freq param
  166. min_count = 2
  167. max_count = 10
  168.  
  169. vertices = []
  170. with open(filename) as f:
  171.     doc_num = int(f.readline())
  172.     words_num = int(f.readline())
  173.     lines_num = int(f.readline())
  174.     words_count = [0] * words_num
  175.     vertices = [[] for _ in range(doc_num)]
  176.     for i in range(lines_num):
  177.         docID, wordID, count = map(int, f.readline().split())
  178.         words_count[wordID - 1] += count
  179.         if count <= max_count:
  180.             vertices[docID - 1].append([wordID, count])
  181.  
  182. new_v = []
  183. for elem in vertices:
  184.     new_v.append([])
  185.     for k in elem:
  186.         if min_count <= words_count[k[0] - 1] <= max_count:
  187.             new_v[-1].append(k)
  188.     if len(new_v[-1]) == 0:
  189.         print("Warning: vertex without words")
  190. vertices = new_v
  191. print('Vertices:', len(vertices))
  192.  
  193. edges = [(distance(vertices[i], vertices[j], words_num), i, j) for i in range(len(vertices) - 1) for j in
  194.          range(i + 1, len(vertices))]
  195. print('Edges:', len(edges))
  196.  
  197. components = cluster(vertices, edges, clusters_num)
  198.  
  199. comp = components.result()
  200. d = dict()
  201. for i in range(len(comp)):
  202.     if comp[i] not in d:
  203.         d[comp[i]] = [i + 1]
  204.     else:
  205.         d[comp[i]].append(i + 1)
  206. for elem in d:
  207.     print("Cluster", elem, ":", len(d[elem]), "elems", ":", d[elem])
  208.  
  209. print("Time:", time.time() - t_start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement