import array import collections import itertools # Placeholder constants FREE = -1 DUMMY = -2 class Dict(collections.MutableMapping): 'Space efficient dictionary with fast iteration and cheap resizes.' @staticmethod def _gen_probes(hashvalue, n): 'Same sequence of probes used in the current dictionary design' PERTURB_SHIFT = 5 if hashvalue < 0: hashvalue = -hashvalue i = hashvalue & (n-1) yield i perturb = hashvalue while True: i = 5 * i + perturb + 1 yield i & (n-1) perturb >>= PERTURB_SHIFT def _lookup(self, key, hashvalue): 'Same lookup logic as currently used in real dicts' assert self.filled < len(self.indices) # At least one open slot freeslot = None for i in self._gen_probes(hashvalue, len(self.indices)): index = self.indices[i] if index == FREE: return (FREE, i) if freeslot is None else (DUMMY, freeslot) elif index == DUMMY: freeslot = i elif (self.keylist[index] is key or self.hashlist[index] == hashvalue and self.keylist[index] == key): return (index, i) @staticmethod def _make_index(n): 'New sequence of indices using the smallest possible datatype' if n <= 2**7: return array.array('b', [FREE]) * n # signed char if n <= 2**15: return array.array('h', [FREE]) * n # signed short if n <= 2**31: return array.array('l', [FREE]) * n # signed long return [FREE] * n # python integers def _resize(self, n): '''Reindex the existing hash/key/value entries. Entries do not get moved, they only get new indices. No calls are made to hash() or __eq__(). ''' n = 2 ** n.bit_length() # round-up to power-of-two self.indices = self._make_index(n) for index, hashvalue in enumerate(self.hashlist): for i in Dict._gen_probes(hashvalue, n): if self.indices[i] == FREE: break self.indices[i] = index self.filled = self.used def clear(self): self.indices = self._make_index(8) self.hashlist = [] self.keylist = [] self.valuelist = [] self.used = 0 self.filled = 0 # used + dummies def __init__(self, *args, **kwds): try: self.keylist except AttributeError: self.clear() self.update(*args, **kwds) def __getitem__(self, key): hashvalue = hash(key) index, i = self._lookup(key, hashvalue) if index < 0: raise KeyError(key) return self.valuelist[index] def __setitem__(self, key, value): hashvalue = hash(key) index, i = self._lookup(key, hashvalue) if index < 0: self.indices[i] = self.used self.hashlist.append(hashvalue) self.keylist.append(key) self.valuelist.append(value) self.used += 1 if index == FREE: self.filled += 1 if self.filled * 3 > len(self.indices) * 2: self._resize(4 * len(self)) else: self.valuelist[index] = value def __delitem__(self, key): hashvalue = hash(key) index, i = self._lookup(key, hashvalue) if index < 0: raise KeyError(key) self.indices[i] = DUMMY self.used -= 1 # If needed, swap with the lastmost entry to avoid leaving a "hole" if index != self.used: lasthash = self.hashlist[-1] lastkey = self.keylist[-1] lastvalue = self.valuelist[-1] lastindex, j = self._lookup(lastkey, lasthash) assert lastindex >= 0 and i != j self.indices[j] = index self.hashlist[index] = lasthash self.keylist[index] = lastkey self.valuelist[index] = lastvalue # Remove the lastmost entry self.hashlist.pop() self.keylist.pop() self.valuelist.pop() def __len__(self): return self.used def __iter__(self): return iter(self.keylist) def iterkeys(self): return iter(self.keylist) def keys(self): return list(self.keylist) def itervalues(self): return iter(self.valuelist) def values(self): return list(self.valuelist) def iteritems(self): return itertools.izip(self.keylist, self.valuelist) def items(self): return zip(self.keylist, self.valuelist) def __contains__(self, key): index, i = self._lookup(key, hash(key)) return index >= 0 def get(self, key, default=None): 'D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.' index, i = self._lookup(key, hash(key)) return self.valuelist[index] if index >= 0 else default def popitem(self): ''' D.popitem() -> (k, v), remove and return some (key, value) pair as a 2-tuple; but raise KeyError if D is empty. ''' try: key = self.keylist[-1] value = self.valuelist[-1] except IndexError: raise KeyError( 'popitem(): dictionary is empty') del self[key] return key, value def __repr__(self): return 'Dict(%r)' % self.items() def show_structure(self): 'Diagnostic method. Not part of the API.' print '=' * 50 print self print 'Indices:', self.indices for i, row in enumerate(zip(self.hashlist, self.keylist, self.valuelist)): print i, row print '-' * 50 import string import random import timeit if __name__ == '__main__': clsnames = ('dict', 'Dict') print '{:6} {:30} {:15} {:15}'.format('Size', 'Test', 'Built-in', 'Proposed') for test_size in (5, 10, 50, 100, 500, 1000, 5000, 10000, 100000): test_data = [(''.join(random.choice(string.ascii_letters) for x in xrange(random.randint(5, 12))), random.randint(0, 2000000000)) for y in xrange(test_size)] for testname, loopcode, setupcode in ( ('creation', 'd = {clsname}(test_data)', 'from __main__ import test_data, Dict'), ('get value', '; '.join(['d.get(v)'] * 50), 'from __main__ import test_data, Dict; import random; d = {clsname}(test_data); v = random.choice(test_data)[0]'), ('iteration with items()', 'for k, v in d.items(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'), ('iteration with iteritems()', 'for k, v in d.iteritems(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'), ('iteration with keys()', 'for k in d.keys(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'), ('iteration with iterkeys()', 'for k in d.iterkeys(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'), ('iteration with values()', 'for v in d.values(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'), ('iteration with itervalues()', 'for v in d.itervalues(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'), ): results = [min(timeit.repeat(loopcode.format(clsname=clsname), setupcode.format(clsname=clsname), repeat=3, number=int(1e6 / test_size))) for clsname in clsnames] fastest = min(results) formatted = ['--' if r == fastest else '{:.3f}x slower'.format(r / fastest) for r in results] print '{:<6} {:30} {:<15} {:<15}'.format(test_size, testname, formatted[0], formatted[1])