Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #coding=utf-8
- import hashlib
# Fingerprint width in bits: 32 hex digits at 4 bits per digit.
hash_len = 4 * 32
def hex2bin(hex_str, width=None):
    """Convert a hexadecimal string to a zero-padded string of binary digits.

    Args:
        hex_str: Text accepted by ``int(hex_str, 16)``.
        width: Minimum length of the result. When omitted, the module-level
            ``hash_len`` is used.

    Returns:
        The binary digits of the value, left-padded with ``'0'`` up to
        ``width`` characters. A value needing more than ``width`` digits is
        returned in full; it is never truncated.

    Raises:
        ValueError: If ``hex_str`` is not valid base-16 text.
    """
    if width is None:
        # Reading a module global needs no ``global`` declaration.
        width = hash_len
    return format(int(hex_str, 16), "b").zfill(width)
def Getmd5(s):
    """Return the MD5 digest of ``s`` as a bit string produced by ``hex2bin``.

    Args:
        s: The data to hash. Values that are not ``bytes`` are encoded as
            UTF-8 first, because hashlib only hashes bytes: passing text
            raises ``TypeError`` on Python 3 and fails for non-ASCII
            ``unicode`` on Python 2.

    Returns:
        ``hex2bin`` applied to the hexadecimal MD5 digest of ``s``.
    """
    if not isinstance(s, bytes):
        s = s.encode("utf-8")
    return hex2bin(hashlib.md5(s).hexdigest())
def SimHash(_str):
    """Compute a SimHash-style fingerprint of ``_str``, one feature per character.

    Each item obtained by iterating ``_str`` is hashed with ``Getmd5``. For
    every bit position, a ``'1'`` bit in that hash adds 1 to the position's
    running total and any other bit subtracts 1.

    Args:
        _str: The sequence to fingerprint; it is iterated item by item.

    Returns:
        A string of ``hash_len`` characters with ``'1'`` where the total is
        greater than or equal to zero and ``'0'`` elsewhere. Empty input
        therefore yields all ``'1'`` characters, since every total stays 0.
    """
    totals = [0] * hash_len
    for ch in _str:
        # Distinct names for the character and its bits; the original reused
        # ``ch`` inside the comprehension and shadowed the loop variable.
        for position, bit in enumerate(Getmd5(ch)):
            totals[position] += 1 if bit == "1" else -1
    return "".join("1" if total >= 0 else "0" for total in totals)
def HammingDistinct(bin_str1, bin_str2):
    """Return the Hamming distance between two strings of binary digits.

    Both arguments are parsed with ``int(..., 2)``, so leading zeros are
    insignificant and the strings may differ in length.

    Args:
        bin_str1: First base-2 digit string.
        bin_str2: Second base-2 digit string.

    Returns:
        The number of set bits in the XOR of the two parsed values.

    Raises:
        ValueError: If either argument is not valid base-2 text.
    """
    differing = int(bin_str1, 2) ^ int(bin_str2, 2)
    # Count the '1' digits of the binary form instead of shifting bit by bit;
    # the manual shift loop never reached zero when the XOR was negative.
    return bin(differing).count("1")
if __name__ == "__main__":
    # Demo: print the pairwise Hamming distances between the fingerprints of
    # three sample strings.
    s1 = "this is a hell"
    s2 = "that is an apple"
    s3 = "hello world"
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(HammingDistinct(SimHash(s1), SimHash(s2)))
    print(HammingDistinct(SimHash(s2), SimHash(s3)))
    print(HammingDistinct(SimHash(s1), SimHash(s3)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement