Advertisement
Guest User

Untitled

a guest
Mar 28th, 2017
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.18 KB | None | 0 0
  1. from hashlib import md5
  2. import time
  3. import sys
  4.  
  5.  
  6. class SimHash(object):
  7.     def __init__(self):
  8.         self.shs = []
  9.  
  10.     def compare(self, queries):
  11.         diff = []
  12.  
  13.         for i, k in queries:
  14.             counter = 0
  15.             pairs = {}
  16.  
  17.             for indx, sh in enumerate(self.shs):
  18.  
  19.                 if i == indx:
  20.                     continue
  21.  
  22.                 key = str(i) + " - " + str(indx)
  23.                 key_r = str(indx) + " - " + str(i)
  24.  
  25.                 if key in pairs:
  26.                     n_diff = pairs[key]
  27.                 else:
  28.                     n_diff = bin(int('0b' + self.shs[i], 2) ^ int('0b' + sh, 2)).count("1")
  29.                     pairs[key] = n_diff
  30.                     pairs[key_r] = n_diff
  31.  
  32.                 if n_diff <= k:
  33.                     counter += 1
  34.  
  35.             diff.append(counter)
  36.  
  37.         return diff
  38.  
  39.     def simhash(self, texts):
  40.         for text in texts:
  41.             hashes = [bin(int(md5(word).hexdigest(), 16))[2:].zfill(128) for word in text.split()]
  42.  
  43.             sh = [2 * temp.count("1") - len(temp) for temp in zip(*hashes)]
  44.             sh = ''.join(map(str, [1 if sh[i] >= 0 else 0 for i in range(len(sh))]))
  45.  
  46.             self.shs.append(sh)
  47.  
  48.  
  49. def readInput():
  50.     lines = sys.stdin.readlines()
  51.     #lines = open(self.filepath).readlines()
  52.  
  53.     N = int(lines[0])
  54.     Q = int(lines[N + 1])
  55.  
  56.     texts = lines[1:N + 1]
  57.     queries = [tuple(map(int, querry.split())) for querry in lines[N + 2:]]
  58.  
  59.     return texts, queries
  60.  
  61.  
  62. def main():
  63.     texts, queries = readInput()
  64.  
  65.     h = SimHash()
  66.  
  67.     #hash_time = time.time()
  68.     h.simhash(texts)
  69.     #hash_time = time.time() - hash_time
  70.     #print "Hashing time: ", hash_time
  71.  
  72.     #querry_time = time.time()
  73.     ret = h.compare(queries)
  74.     #querry_time = time.time() - querry_time
  75.     #print "Querry time: ", querry_time
  76.  
  77.     #print "Total time: ", querry_time + hash_time
  78.  
  79.     #lines = open(OUTPUT).readlines()
  80.     #lines = [int(line.rstrip()) for line in lines]
  81.  
  82.     #print "Accuracy: ", sum([1 if l1 == l2 else 0 for l1, l2 in zip(ret, lines)]) / float(len(ret))
  83.  
  84.     for item in ret:
  85.         print item
  86.  
  87. if __name__=="__main__":
  88.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement