Advertisement
eXFq7GJ1cC

Untitled

Jan 2nd, 2013
169
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.97 KB | None | 0 0
  1. import array
  2. import collections
  3. import itertools
  4.  
  5. # Placeholder constants
  6. FREE = -1
  7. DUMMY = -2
  8.  
  9. class Dict(collections.MutableMapping):
  10.     'Space efficient dictionary with fast iteration and cheap resizes.'
  11.  
  12.     @staticmethod
  13.     def _gen_probes(hashvalue, n):
  14.         'Same sequence of probes used in the current dictionary design'
  15.         PERTURB_SHIFT = 5
  16.         if hashvalue < 0:
  17.             hashvalue = -hashvalue
  18.         i = hashvalue & (n-1)
  19.         yield i
  20.         perturb = hashvalue
  21.         while True:
  22.             i = 5 * i + perturb + 1
  23.             yield i & (n-1)
  24.             perturb >>= PERTURB_SHIFT
  25.  
  26.     def _lookup(self, key, hashvalue):
  27.         'Same lookup logic as currently used in real dicts'
  28.         assert self.filled < len(self.indices)   # At least one open slot
  29.         freeslot = None
  30.         for i in self._gen_probes(hashvalue, len(self.indices)):
  31.             index = self.indices[i]
  32.             if index == FREE:
  33.                 return (FREE, i) if freeslot is None else (DUMMY, freeslot)
  34.             elif index == DUMMY:
  35.                 freeslot = i
  36.             elif (self.keylist[index] is key or
  37.                   self.hashlist[index] == hashvalue
  38.                   and self.keylist[index] == key):
  39.                     return (index, i)
  40.  
  41.     @staticmethod
  42.     def _make_index(n):
  43.         'New sequence of indices using the smallest possible datatype'
  44.         if n <= 2**7: return array.array('b', [FREE]) * n       # signed char
  45.         if n <= 2**15: return array.array('h', [FREE]) * n      # signed short
  46.         if n <= 2**31: return array.array('l', [FREE]) * n      # signed long
  47.         return [FREE] * n                                       # python integers
  48.  
  49.     def _resize(self, n):
  50.         '''Reindex the existing hash/key/value entries.
  51.           Entries do not get moved, they only get new indices.
  52.           No calls are made to hash() or __eq__().
  53.  
  54.        '''
  55.         n = 2 ** n.bit_length()                     # round-up to power-of-two
  56.         self.indices = self._make_index(n)
  57.         for index, hashvalue in enumerate(self.hashlist):
  58.             for i in Dict._gen_probes(hashvalue, n):
  59.                 if self.indices[i] == FREE:
  60.                     break
  61.             self.indices[i] = index
  62.         self.filled = self.used
  63.  
  64.     def clear(self):
  65.         self.indices = self._make_index(8)
  66.         self.hashlist = []
  67.         self.keylist = []
  68.         self.valuelist = []
  69.         self.used = 0
  70.         self.filled = 0                                         # used + dummies
  71.  
  72.     def __init__(self, *args, **kwds):
  73.         try:
  74.             self.keylist
  75.         except AttributeError:
  76.             self.clear()
  77.         self.update(*args, **kwds)
  78.  
  79.     def __getitem__(self, key):
  80.         hashvalue = hash(key)
  81.         index, i = self._lookup(key, hashvalue)
  82.         if index < 0:
  83.             raise KeyError(key)
  84.         return self.valuelist[index]
  85.  
  86.     def __setitem__(self, key, value):
  87.         hashvalue = hash(key)
  88.         index, i = self._lookup(key, hashvalue)
  89.         if index < 0:
  90.             self.indices[i] = self.used
  91.             self.hashlist.append(hashvalue)
  92.             self.keylist.append(key)
  93.             self.valuelist.append(value)
  94.             self.used += 1
  95.             if index == FREE:
  96.                 self.filled += 1
  97.                 if self.filled * 3 > len(self.indices) * 2:
  98.                     self._resize(4 * len(self))
  99.         else:
  100.             self.valuelist[index] = value
  101.  
  102.     def __delitem__(self, key):
  103.         hashvalue = hash(key)
  104.         index, i = self._lookup(key, hashvalue)
  105.         if index < 0:
  106.             raise KeyError(key)
  107.         self.indices[i] = DUMMY
  108.         self.used -= 1
  109.         # If needed, swap with the lastmost entry to avoid leaving a "hole"
  110.         if index != self.used:
  111.             lasthash = self.hashlist[-1]
  112.             lastkey = self.keylist[-1]
  113.             lastvalue = self.valuelist[-1]
  114.             lastindex, j = self._lookup(lastkey, lasthash)
  115.             assert lastindex >= 0 and i != j
  116.             self.indices[j] = index
  117.             self.hashlist[index] = lasthash
  118.             self.keylist[index] = lastkey
  119.             self.valuelist[index] = lastvalue
  120.         # Remove the lastmost entry
  121.         self.hashlist.pop()
  122.         self.keylist.pop()
  123.         self.valuelist.pop()
  124.  
  125.     def __len__(self):
  126.         return self.used
  127.  
  128.     def __iter__(self):
  129.         return iter(self.keylist)
  130.  
  131.     def iterkeys(self):
  132.         return iter(self.keylist)
  133.  
  134.     def keys(self):
  135.         return list(self.keylist)
  136.  
  137.     def itervalues(self):
  138.         return iter(self.valuelist)
  139.  
  140.     def values(self):
  141.         return list(self.valuelist)
  142.  
  143.     def iteritems(self):
  144.         return itertools.izip(self.keylist, self.valuelist)
  145.  
  146.     def items(self):
  147.         return zip(self.keylist, self.valuelist)
  148.  
  149.     def __contains__(self, key):
  150.         index, i = self._lookup(key, hash(key))
  151.         return index >= 0
  152.  
  153.     def get(self, key, default=None):
  154.         'D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None.'
  155.         index, i = self._lookup(key, hash(key))
  156.         return self.valuelist[index] if index >= 0 else default
  157.  
  158.     def popitem(self):
  159.         ''' D.popitem() -> (k, v), remove and return some (key, value) pair as a
  160.            2-tuple; but raise KeyError if D is empty.
  161.        '''
  162.         try:
  163.             key = self.keylist[-1]
  164.             value = self.valuelist[-1]
  165.         except IndexError:
  166.             raise KeyError( 'popitem(): dictionary is empty')
  167.         del self[key]
  168.         return key, value
  169.  
  170.     def __repr__(self):
  171.         return 'Dict(%r)' % self.items()
  172.  
  173.     def show_structure(self):
  174.         'Diagnostic method.  Not part of the API.'
  175.         print '=' * 50
  176.         print self
  177.         print 'Indices:', self.indices
  178.         for i, row in enumerate(zip(self.hashlist, self.keylist, self.valuelist)):
  179.             print i, row
  180.         print '-' * 50
  181.  
  182.  
  183. import string
  184. import random
  185. import timeit
  186.  
  187. if __name__ == '__main__':
  188.     clsnames = ('dict', 'Dict')
  189.     print '{:6} {:30} {:15} {:15}'.format('Size', 'Test', 'Built-in', 'Proposed')
  190.     for test_size in (5, 10, 50, 100, 500, 1000, 5000, 10000, 100000):
  191.         test_data = [(''.join(random.choice(string.ascii_letters) for x in xrange(random.randint(5, 12))), random.randint(0, 2000000000)) for y in xrange(test_size)]
  192.         for testname, loopcode, setupcode in (
  193.                 ('creation', 'd = {clsname}(test_data)', 'from __main__ import test_data, Dict'),
  194.                 ('get value', '; '.join(['d.get(v)'] * 50), 'from __main__ import test_data, Dict; import random; d = {clsname}(test_data); v = random.choice(test_data)[0]'),
  195.                 ('iteration with items()', 'for k, v in d.items(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
  196.                 ('iteration with iteritems()', 'for k, v in d.iteritems(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
  197.                 ('iteration with keys()', 'for k in d.keys(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
  198.                 ('iteration with iterkeys()', 'for k in d.iterkeys(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
  199.                 ('iteration with values()', 'for v in d.values(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
  200.                 ('iteration with itervalues()', 'for v in d.itervalues(): pass', 'from __main__ import test_data, Dict; d = {clsname}(test_data)'),
  201.                 ):
  202.             results = [min(timeit.repeat(loopcode.format(clsname=clsname), setupcode.format(clsname=clsname), repeat=3, number=int(1e6 / test_size)))
  203.                             for clsname in clsnames]
  204.             fastest = min(results)
  205.             formatted = ['--' if r == fastest else '{:.3f}x slower'.format(r / fastest) for r in results]
  206.             print '{:<6} {:30} {:<15} {:<15}'.format(test_size, testname, formatted[0], formatted[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement