Advertisement
Guest User

Untitled

a guest
Oct 14th, 2019
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.64 KB | None | 0 0
  1. import time
  2. from math import sqrt
  3.  
  4.  
  5. class DisjointSetUnion:
  6.     def __init__(self, size):
  7.         self.parent_ = list(range(size))
  8.         self.ranks_ = [0] * size
  9.  
  10.     def find(self, node):
  11.         if self.parent_[node] != node:
  12.             self.parent_[node] = self.find(self.parent_[node])
  13.         return self.parent_[node]
  14.  
  15.     def union_sets(self, first, second):
  16.         f_root = self.find(first)
  17.         s_root = self.find(second)
  18.         if f_root == s_root:
  19.             return False
  20.         if self.ranks_[f_root] < self.ranks_[s_root]:
  21.             self.parent_[f_root] = s_root
  22.         elif self.ranks_[f_root] > self.ranks_[s_root]:
  23.             self.parent_[s_root] = f_root
  24.         else:
  25.             self.parent_[s_root] = f_root
  26.             self.ranks_[f_root] += 1
  27.         return True
  28.  
  29.     def result(self):
  30.         return list(map(self.find, self.parent_))
  31.  
  32.  
  33. def distance_1(title1, title2, num_of_words):
  34.     # simple vec distance
  35.     id1 = 0
  36.     id2 = 0
  37.     vec = []
  38.     while id1 < len(title1) and id2 < len(title2):
  39.         x = y = 0
  40.         word1 = title1[id1][0]
  41.         word2 = title2[id2][0]
  42.         if word1 <= word2:
  43.             x = title1[id1][1]
  44.             id1 += 1
  45.         if word1 >= word2:
  46.             y = title2[id2][1]
  47.             id2 += 1
  48.         vec.append(abs(x - y))
  49.     while id1 < len(title1):
  50.         vec.append(title1[id1][1])
  51.         id1 += 1
  52.     while id2 < len(title2):
  53.         vec.append(title2[id2][1])
  54.         id2 += 1
  55.     return sqrt(sum(map(lambda x: x * x, vec)))
  56.  
  57.  
  58. def distance_2(title1, title2, num_of_words):
  59.     # manhattan vec distance
  60.     id1 = 0
  61.     id2 = 0
  62.     vec = []
  63.     while id1 < len(title1) and id2 < len(title2):
  64.         x = y = 0
  65.         word1 = title1[id1][0]
  66.         word2 = title2[id2][0]
  67.         if word1 <= word2:
  68.             x = title1[id1][1]
  69.             id1 += 1
  70.         if word1 >= word2:
  71.             y = title2[id2][1]
  72.             id2 += 1
  73.         vec.append(abs(x - y))
  74.     while id1 < len(title1):
  75.         vec.append(title1[id1][1])
  76.         id1 += 1
  77.     while id2 < len(title2):
  78.         vec.append(title2[id2][1])
  79.         id2 += 1
  80.     return sum(vec)
  81.  
  82.  
  83. def distance_3(title1, title2, num_of_words):
  84.     # Pearson
  85.     x = [0] * num_of_words
  86.     for elem in title1:
  87.         x[elem[0] - 1] += elem[1]
  88.     y = [0] * num_of_words
  89.     for elem in title2:
  90.         y[elem[0] - 1] += elem[1]
  91.     x_m = sum(x) / len(x)
  92.     y_m = sum(y) / len(y)
  93.     x = [x[_] - x_m for _ in range(len(x))]
  94.     y = [y[_] - y_m for _ in range(len(x))]
  95.     a = sum([x[_] * y[_] for _ in range(len(x))])
  96.     b = sum([x[_] * x[_] for _ in range(len(x))])
  97.     c = sum([y[_] * y[_] for _ in range(len(x))])
  98.     if a == 0:
  99.         return 1
  100.     return 1 - (a / sqrt(b * c))
  101.  
  102.  
  103. def cluster(ver, eds, num_of_clusters):
  104.     # return DisjointSetUnion of edges
  105.     res = DisjointSetUnion(len(ver))
  106.     # Your code here
  107.     eds.sort()
  108.     cnt = len(ver)
  109.     for i in range(len(eds)):
  110.         if cnt == num_of_clusters:
  111.             break
  112.         a = eds[i][1]
  113.         b = eds[i][2]
  114.         if res.find(a) != res.find(b):
  115.             res.union_sets(a, b)
  116.             cnt -= 1
  117.     return res
  118.  
  119.  
  120. t_start = time.time()
  121.  
  122. # input file
  123. filename = '/Users/anton/Downloads/task/test_400.txt'
  124.  
  125. # function of distance
  126. distance = distance_3
  127.  
  128. # clusters num
  129. clusters_num = 5
  130.  
  131. # word freq param
  132. min_count = 2
  133. max_count = 10
  134.  
  135. vertices = []
  136. with open(filename) as f:
  137.     doc_num = int(f.readline())
  138.     words_num = int(f.readline())
  139.     lines_num = int(f.readline())
  140.     words_count = [0] * words_num
  141.     vertices = [[] for _ in range(doc_num)]
  142.     for i in range(lines_num):
  143.         docID, wordID, count = map(int, f.readline().split())
  144.         words_count[wordID - 1] += count
  145.         if count <= max_count:
  146.             vertices[docID - 1].append([wordID, count])
  147.  
  148. new_v = []
  149. for elem in vertices:
  150.     new_v.append([])
  151.     for k in elem:
  152.         if min_count <= words_count[k[0] - 1] <= max_count:
  153.             new_v[-1].append(k)
  154.     if len(new_v[-1]) == 0:
  155.         print("Warning: vertex without words")
  156. vertices = new_v
  157. print('Vertices:', len(vertices))
  158.  
  159. edges = [(distance(vertices[i], vertices[j], words_num), i, j) for i in range(len(vertices) - 1) for j in range(i + 1, len(vertices))]
  160. print('Edges:', len(edges))
  161.  
  162. components = cluster(vertices, edges, clusters_num)
  163.  
  164. comp = components.result()
  165. d = dict()
  166. for i in range(len(comp)):
  167.     if comp[i] not in d:
  168.         d[comp[i]] = [i + 1]
  169.     else:
  170.         d[comp[i]].append(i + 1)
  171. for elem in d:
  172.     print("Cluster", elem, ":", len(d[elem]), "elems", ":", d[elem])
  173.  
  174. print("Time:", time.time() - t_start)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement