Advertisement
Guest User

Untitled

a guest
Nov 29th, 2015
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.06 KB | None | 0 0
  1. #coding=utf-8
  2. import hashlib
  3.  
  4. hash_len = 32*4
  5.  
  6. def hex2bin(hex_str):
  7. global hash_len
  8. res = ""
  9. dec_num = int(hex_str,16)
  10. bin_str = bin(dec_num)[2:]
  11. return ('0'*(hash_len-len(bin_str)))+ bin_str
  12.  
  13. def Getmd5(s):
  14. tmp = hashlib.md5()
  15. tmp.update(s)
  16. return hex2bin(tmp.hexdigest())
  17.  
  18. def SimHash(_str):
  19. global hash_len
  20. _sum = [0] * hash_len
  21. for ch in _str:
  22. num_list = [int(ch=='1')*2-1 for ch in list(Getmd5(ch))]
  23. for i in range(len(num_list)):
  24. _sum[i] += num_list[i]
  25. hash_num = "".join(map(lambda x:str(int(x >= 0)),_sum))
  26. # return int(finger_print,2)
  27. return hash_num
  28.  
  29. def HammingDistinct(bin_str1, bin_str2):
  30. tmp = int(bin_str1,2) ^ int(bin_str2,2)
  31. ct = 0
  32. while tmp != 0:
  33. ct += tmp & 0x01
  34. tmp = tmp >> 1
  35. return ct
  36.  
  37.  
  38. if __name__ == "__main__":
  39. s1 = "this is a hell"
  40. s2 = "that is an apple"
  41. s3 = "hello world"
  42. print HammingDistinct(SimHash(s1),SimHash(s2))
  43. print HammingDistinct(SimHash(s2),SimHash(s3))
  44. print HammingDistinct(SimHash(s1),SimHash(s3))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement