Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import array
- import collections
- import itertools
- # Placeholder constants
- FREE = -1
- DUMMY = -2
- class Dict(collections.MutableMapping):
- 'Space efficient dictionary with fast iteration and cheap resizes.'
- @staticmethod
- def _gen_probes(hashvalue, n):
- 'Same sequence of probes used in the current dictionary design'
- PERTURB_SHIFT = 5
- if hashvalue < 0:
- hashvalue = -hashvalue
- i = hashvalue & (n-1)
- yield i
- perturb = hashvalue
- while True:
- i = 5 * i + perturb + 1
- yield i & (n-1)
- perturb >>= PERTURB_SHIFT
- def _lookup(self, key, hashvalue):
- 'Same lookup logic as currently used in real dicts'
- assert self.filled < len(self.indices) # At least one open slot
- freeslot = None
- for i in self._gen_probes(hashvalue, len(self.indices)):
- index = self.indices[i]
- if index == FREE:
- return (FREE, i) if freeslot is None else (DUMMY, freeslot)
- elif index == DUMMY:
- freeslot = i
- elif (self.keylist[index] is key or
- self.hashlist[index] == hashvalue
- and self.keylist[index] == key):
- return (index, i)
- @staticmethod
- def _make_index(n):
- 'New sequence of indices using the smallest possible datatype'
- if n <= 2**7: return array.array('b', [FREE]) * n # signed char
- if n <= 2**15: return array.array('h', [FREE]) * n # signed short
- if n <= 2**31: return array.array('l', [FREE]) * n # signed long
- return [FREE] * n # python integers
- def _resize(self, n):
- '''Reindex the existing hash/key/value entries.
- Entries do not get moved, they only get new indices.
- No calls are made to hash() or __eq__().
- '''
- n = 2 ** n.bit_length() # round-up to power-of-two
- self.indices = self._make_index(n)
- for index, hashvalue in enumerate(self.hashlist):
- for i in Dict._gen_probes(hashvalue, n):
- if self.indices[i] == FREE:
- break
- self.indices[i] = index
- self.filled = self.used
- def clear(self):
- self.indices = self._make_index(8)
- self.hashlist = []
- self.keylist = []
- self.valuelist = []
- self.used = 0
- self.filled = 0 # used + dummies
- def __init__(self, *args, **kwds):
- try:
- self.keylist
- except AttributeError:
- self.clear()
- self.update(*args, **kwds)
- def __getitem__(self, key):
- hashvalue = hash(key)
- index, i = self._lookup(key, hashvalue)
- if index < 0:
- raise KeyError(key)
- return self.valuelist[index]
- def __setitem__(self, key, value):
- hashvalue = hash(key)
- index, i = self._lookup(key, hashvalue)
- if index < 0:
- self.indices[i] = self.used
- self.hashlist.append(hashvalue)
- self.keylist.append(key)
- self.valuelist.append(value)
- self.used += 1
- if index == FREE:
- self.filled += 1
- if self.filled * 3 > len(self.indices) * 2:
- self._resize(4 * len(self))
- else:
- self.valuelist[index] = value
- def __delitem__(self, key):
- hashvalue = hash(key)
- index, i = self._lookup(key, hashvalue)
- if index < 0:
- raise KeyError(key)
- self.indices[i] = DUMMY
- self.used -= 1
- # If needed, swap with the lastmost entry to avoid leaving a "hole"
- if index != self.used:
- lasthash = self.hashlist[-1]
- lastkey = self.keylist[-1]
- lastvalue = self.valuelist[-1]
- lastindex, j = self._lookup(lastkey, lasthash)
- assert lastindex >= 0 and i != j
- self.indices[j] = index
- self.hashlist[index] = lasthash
- self.keylist[index] = lastkey
- self.valuelist[index] = lastvalue
- # Remove the lastmost entry
- self.hashlist.pop()
- self.keylist.pop()
- self.valuelist.pop()
- def __len__(self):
- return self.used
- def __iter__(self):
- return iter(self.keylist)
- def iterkeys(self):
- return iter(self.keylist)
- def keys(self):
- return list(self.keylist)
- def itervalues(self):
- return iter(self.valuelist)
- def values(self):
- return list(self.valuelist)
- def iteritems(self):
- return itertools.izip(self.keylist, self.valuelist)
- def items(self):
- return zip(self.keylist, self.valuelist)
- def __contains__(self, key):
- index, i = self._lookup(key, hash(key))
- return index >= 0
- def get(self, key, default=None):
- 'D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.'
- index, i = self._lookup(key, hash(key))
- return self.valuelist[index] if index >= 0 else default
- def popitem(self):
- ''' D.popitem() -> (k, v), remove and return some (key, value) pair as a
- 2-tuple; but raise KeyError if D is empty.
- '''
- try:
- key = self.keylist[-1]
- value = self.valuelist[-1]
- except IndexError:
- raise KeyError( 'popitem(): dictionary is empty')
- del self[key]
- return key, value
- def __repr__(self):
- return 'Dict(%r)' % self.items()
- def show_structure(self):
- 'Diagnostic method. Not part of the API.'
- print '=' * 50
- print self
- print 'Indices:', self.indices
- for i, row in enumerate(zip(self.hashlist, self.keylist, self.valuelist)):
- print i, row
- print '-' * 50
- import string
- import random
- import timeit
- if __name__ == '__main__':
- clsnames = ('dict', 'Dict')
- print '{:6} {:30} {:15} {:15}'.format('Size', 'Test', 'Built-in', 'Proposed')
- for test_size in (5, 10, 50, 100, 500, 1000, 5000, 10000, 100000):
- test_data = [(''.join(random.choice(string.ascii_letters) for x in xrange(random.randint(5, 12))), random.randint(0, 2000000000)) for y in xrange(test_size)]
- for testname, loopcode, setupcode in (
- ('creation', 'd = {clsname}(test_data)', 'from __main__ import test_data, Dict'),
- ('get value', '; '.join(['d.get(v)'] * 50), 'from __main__ import test_data, Dict; import random; d = {clsname}(test_data); v = random.choice(test_data)[0]'),
- ('iteration with items()', 'for k, v in d.items(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
- ('iteration with iteritems()', 'for k, v in d.iteritems(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
- ('iteration with keys()', 'for k in d.keys(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
- ('iteration with iterkeys()', 'for k in d.iterkeys(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
- ('iteration with values()', 'for v in d.values(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
- ('iteration with itervalues()', 'for v in d.itervalues(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
- ):
- results = [min(timeit.repeat(loopcode.format(clsname=clsname), setupcode.format(clsname=clsname), repeat=3, number=int(1e6 / test_size)))
- for clsname in clsnames]
- fastest = min(results)
- formatted = ['--' if r == fastest else '{:.3f}x slower'.format(r / fastest) for r in results]
- print '{:<6} {:30} {:<15} {:<15}'.format(test_size, testname, formatted[0], formatted[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement