Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from math import log, ceil
- import mmh3
class BloomFilter:
    '''Bloom filter over a fixed-size list of boolean slots.

    Items are hashed with mmh3 under ``hash_count`` different seeds; each
    hash selects one slot. An item is reported as already present when every
    one of its slots is set, so false positives are possible but false
    negatives are not.
    '''

    def __init__(self, n, p=0.01):
        ''' n is the expected number of unique items.
        p is the target false positive rate, strictly between 0 and 1.

        Raises ValueError when n is not positive or p is outside (0, 1);
        such values would otherwise produce an empty or unusable filter.
        '''
        if n <= 0:
            raise ValueError("n must be a positive number of items")
        if not 0 < p < 1:
            raise ValueError("p must be strictly between 0 and 1")
        # Sizing formulas: m = -n*ln(p) / ln(2)^2 slots, k = (m/n)*ln(2) hashes.
        self.size = int(ceil((-n * log(p)) / (log(2) ** 2)))
        self.hash_count = int(ceil((self.size / n) * log(2)))
        self.filter = [False] * self.size

    def _indexes(self, item):
        # One slot index per seed. Python's % with a positive modulus keeps
        # the index in [0, size) even if the hash value is negative.
        return [mmh3.hash(item, seed) % self.size
                for seed in range(self.hash_count)]

    def __contains__(self, item):
        '''Return True if item was possibly added before, without modifying
        the filter. A False result means the item was never added.
        '''
        return all(self.filter[i] for i in self._indexes(item))

    def add(self, item):
        '''Add item to the filter.

        Returns False when every slot for the item is already set (the item
        is probably a duplicate) and leaves the filter unchanged. Otherwise
        sets the item's slots and returns True.
        '''
        slots = self._indexes(item)
        if all(self.filter[i] for i in slots):
            return False
        # At least one slot was clear, so the item is definitely new.
        for i in slots:
            self.filter[i] = True
        return True
Add Comment
Please, Sign In to add comment