Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from hashlib import md5
- import time
- import sys
class SimHash(object):
    """Collection of 128-bit SimHash fingerprints with Hamming-distance queries.

    Fingerprints are kept in ``self.shs`` as strings of ``'0'``/``'1'``
    characters, one entry per text passed to :meth:`simhash`, in insertion
    order.
    """

    # MD5 digests are 128 bits wide, so every fingerprint has this many bits.
    _HASH_BITS = 128

    def __init__(self):
        self.shs = []

    def compare(self, queries):
        """Count, for each query, the other fingerprints within a bit distance.

        Args:
            queries: iterable of ``(i, k)`` pairs, where ``i`` is an index
                into ``self.shs`` and ``k`` is the maximum number of
                differing bits.

        Returns:
            A list with one count per query: the number of stored
            fingerprints, other than the one at index ``i``, that differ
            from fingerprint ``i`` in at most ``k`` bit positions.

        Raises:
            IndexError: if ``i`` does not refer to a stored fingerprint.
        """
        # Parse each bit string once per call instead of once per comparison.
        fingerprints = [int(sh, 2) for sh in self.shs]
        if not fingerprints:
            # Nothing is stored, so nothing can match any query.
            return [0 for _ in queries]

        counts = []
        for i, k in queries:
            target = fingerprints[i]
            near = 0
            for indx, other in enumerate(fingerprints):
                if indx == i:
                    continue
                # XOR leaves a 1 in every position where the two differ.
                if bin(target ^ other).count("1") <= k:
                    near += 1
            counts.append(near)
        return counts

    def simhash(self, texts):
        """Compute a fingerprint for every text and append it to ``self.shs``.

        Each whitespace-separated word is hashed with MD5. For every bit
        position the words vote: a ``1`` bit counts +1, a ``0`` bit counts
        -1. The fingerprint bit is ``1`` when the total is >= 0, else ``0``.
        A text without any words therefore yields all ones (every total is
        zero), so the fingerprint is always exactly 128 characters long.

        Args:
            texts: iterable of strings (text or bytes).
        """
        width = self._HASH_BITS
        for text in texts:
            totals = [0] * width
            for word in text.split():
                # hashlib only accepts bytes; encode text words first.
                if not isinstance(word, bytes):
                    word = word.encode("utf-8")
                bits = bin(int(md5(word).hexdigest(), 16))[2:].zfill(width)
                for position, bit in enumerate(bits):
                    totals[position] += 1 if bit == "1" else -1
            self.shs.append(
                "".join("1" if total >= 0 else "0" for total in totals)
            )
def readInput():
    """Read the texts and queries from standard input.

    Expected layout, one item per line::

        N
        <N lines of text>
        Q
        <Q lines, each holding two integers>

    Blank query lines (for example a trailing empty line) are skipped, and
    at most ``Q`` queries are returned.

    Returns:
        A ``(texts, queries)`` tuple: ``texts`` is the list of the N raw
        text lines (line endings included) and ``queries`` is a list of
        integer tuples, one per query line.

    Raises:
        IndexError: if the input ends before the ``N`` or ``Q`` line.
        ValueError: if a count or a query field is not an integer.
    """
    lines = sys.stdin.readlines()
    n_texts = int(lines[0])
    n_queries = int(lines[n_texts + 1])
    texts = lines[1:n_texts + 1]
    # Drop blank lines first so they neither become empty tuples nor use up
    # one of the Q declared query slots.
    query_lines = [line for line in lines[n_texts + 2:] if line.strip()]
    queries = [
        tuple(map(int, line.split())) for line in query_lines[:n_queries]
    ]
    return texts, queries
def main():
    """Read the input, fingerprint the texts and print one count per query.

    The texts and queries come from ``readInput()``. Every text is
    fingerprinted with ``SimHash.simhash`` and each result of
    ``SimHash.compare`` is printed on its own line.
    """
    texts, queries = readInput()
    index = SimHash()
    index.simhash(texts)
    for count in index.compare(queries):
        # The call form with a single argument prints the same on Python 2 and 3.
        print(count)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement