Guest User

Untitled

a guest
Nov 24th, 2017
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.67 KB | None | 0 0
  1. import sys
  2. from math import log, ceil
  3. import mmh3
  4.  
  5. class BloomFilter:
  6.  
  7. def __init__(self, n, p=0.01):
  8. ''' n is size of unique lines.
  9. p is false positive rate
  10. '''
  11. self.size = int(ceil((-n*log(p))/ (log(2)**2)))
  12. self.hash_count = int(ceil((self.size/n) * log(2)))
  13. self.filter = [False for _ in range(self.size)]
  14.  
  15. def add(self, item):
  16. # Adds data to bloom filter
  17. hash_keys = []
  18. for seed in range(self.hash_count):
  19. key = mmh3.hash(item, seed) % self.size
  20. hash_keys.append(key)
  21.  
  22. if all([self.filter[i] for i in hash_keys]):
  23. return False
  24. # If above doesn't return it means it can be added
  25. for key in hash_keys:
  26. self.filter[key] = True
  27. return True
Add Comment
Please, Sign In to add comment