Advertisement
agf

Huffman Codec and BitWriter / BitReader

agf
Aug 2nd, 2011
713
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.35 KB | None | 0 0
  1. from __future__ import with_statement
  2. from heapq import heapify, heappop, heappushpop
  3. from contextlib import closing
  4. from itertools import count
  5. from mmap import mmap
  6. try:
  7.     from collections import Counter
  8. except:
  9.     def Counter(iterable):
  10.         frequencies = __import__('collections').defaultdict(int)
  11.         for item in iterable:
  12.             frequencies[item] += 1
  13.         return frequencies
  14.  
  15. class BitWriter(object):
  16.     """Accepts bitstrings and writes to a file-like object"""
  17.     def __init__(self, bitfile, codes = None):
  18.         self.bitfile, self.remainder, self.codes = bitfile, '', codes
  19.     @staticmethod
  20.     def tobytes(bits):
  21.         return (chr(int(bits[i:i+8], 2)) for i in range(0, len(bits), 8))
  22.     def write(self, bits):
  23.         bits, self.remainder = self.remainder + bits, ''
  24.         bytes = len(bits) // 8
  25.         if not bytes:
  26.             self.remainder = bits
  27.             return
  28.         fullbits = bytes * 8
  29.         self.remainder, bits = bits[fullbits:], bits[:fullbits]
  30.         self.bitfile.write(''.join(self.tobytes(bits)))
  31.     def bitpad(self):
  32.         """Smart bitpadder - Uses a sequence that isn't a valid code to bitpad
  33.           This is an alternative to a prepended bit signifying padding
  34.           It doesn't require the reader know anything about the bitpadding"""
  35.         tail, pad = -len(self.remainder) % 8, ''
  36.         if self.codes:
  37.             for pad in (bin(i)[2:].zfill(tail) for i in range(2**tail)):
  38.                 if all(not pad.startswith(bitcode) for bitcode in self.codes()):
  39.                     break
  40.         if tail and not pad:
  41.             print "Possible code collision with zero padding"
  42.         self.write(pad or '0' * tail)
  43.  
  44. class BitReader(object):
  45.     """Reads from a file-like object and returns bitstrings"""
  46.     def __init__(self, bitfile):
  47.         self.bitfile, self.remainder = bitfile, ''
  48.     @staticmethod
  49.     def tobits(bytestring):
  50.         for c in bytestring:
  51.             yield bin(ord(c))[2:].zfill(8)
  52.     def __iter__(self):
  53.         return iter(lambda: self.read(1), '')
  54.     def read(self, bits=None):
  55.         remainder = self.remainder
  56.         if bits is None:
  57.             self.remainder, data = '', remainder
  58.             return data + ''.join(self.tobits(self.infile.read()))
  59.         rlen = len(remainder)
  60.         if bits <= rlen:
  61.             self.remainder, data = remainder[bits:], remainder[:bits]
  62.             return data
  63.         bits -= rlen
  64.         numbytes, extra = divmod(bits, 8)
  65.         data = ''.join(self.tobits(self.bitfile.read(numbytes + (not not extra))))
  66.         self.remainder, data = data[bits:], remainder + data[:bits]
  67.         return data
  68.  
  69. class BitFileReader(BitReader):
  70.     """BitReader that opens and closes its file"""
  71.     def __init__(self, filename):
  72.         self.filename, self.remainder = filename, ''
  73.     def open(self):
  74.         self.bitfile = open(self.filename, 'rb')
  75.         return self
  76.     def close(self, *args):
  77.         self.bitfile.close()
  78.     __enter__ = open
  79.     __exit__ = close
  80.  
  81. class BitFileWriter(BitWriter):
  82.     """BitWriter that opens and closes its file and bitpads itself"""
  83.     def __init__(self, filename, codes = None):
  84.         self.filename, self.remainder, self.codes = filename, '', codes
  85.     def open(self):
  86.         self.bitfile = open(self.filename, 'wb')
  87.         return self
  88.     def close(self, *args):
  89.         self.bitpad()
  90.         self.bitfile.close()
  91.     __enter__ = open
  92.     __exit__ = close
  93.  
  94.  
  95.  
  96. def codetree(node):
  97.     """Codes a node, and calls itself on the node's children"""
  98.     for i, child in enumerate(node[1:3]):
  99.         child[3] = node[3] + str(i)
  100.         if isinstance(child[1], list):
  101.             codetree(child)
  102.  
  103. def codedata(indata):
  104.     """Creates a mapping of byte to code by
  105.       1. generating a heap of nodes
  106.       2. turning them into a tree by frequency
  107.       3. recursively coding the tree
  108.    """
  109.     index = count().next
  110.     heap = [[freq, index(), byte, ''] for byte, freq in Counter(indata).iteritems()]
  111.     heapify(heap)
  112.     tempheap = heap[:]
  113.     head = heappop(tempheap)
  114.     while tempheap:
  115.         temp = heappop(tempheap)
  116.         head = heappushpop(tempheap, [head[0] + temp[0], head, temp, ''])
  117.     codetree(head)
  118.     return Codec(node[2:] for node in heap)
  119.  
  120. def codefile(filename):
  121.     """Open a file, mmap it, and code it"""
  122.     with open(filename, 'r+b') as infile:
  123.         with closing(mmap(infile.fileno(), 0)) as indata:
  124.             return codedata(indata)
  125.  
  126. def loadcodetable(codetablefilename):
  127.     """Load a table from a file
  128.       Correctly interprets newlines and nul bytes
  129.       CSV Can't do nul bytes and shelve is awkward and overkill"""
  130.     table, byte = Codec(), False
  131.     with open(codetablefilename, 'rb') as codetablefile:
  132.         for line in codetablefile:
  133.             if not byte:
  134.                 byte, line = line[0], line[1:]
  135.             if not line:
  136.                 continue
  137.             table[byte] = line[:-1]
  138.             byte = False
  139.     return table
  140.  
  141. def fliptable(table):
  142.     """Flip a table, works as expected if the values are unique"""
  143.     table.update([(value, key) for key, value in (table.popitem() for Null in range(len(table)))])
  144.     return table
  145.  
  146. def keysarebytes(table):
  147.     """Determing if a table is a bytetable or codetable"""
  148.     return all(len(key) == 1 for key in table)
  149.  
  150. def dumptable(table, tablefilename):
  151.     """Dump a table to a file
  152.       Uses a custon one item per line format"""
  153.     outtable = table if keysarebytes(table) else fliptable(table)
  154.     with open(tablefilename, 'wb') as tablefile:
  155.         for byte, code in outtable.iteritems():
  156.             tablefile.write(byte + code + '\n')
  157.     return table
  158.  
  159. def encode(bytetable, indata):
  160.     """Encode a string-like object by looking up each byte in a table"""
  161.     for byte in indata:
  162.         yield bytetable[byte]
  163.  
  164. def decode(codetable, encodeddata, code=''):
  165.     """Decode a bitstring-like object by reading bits until a code is found
  166.       then yielding the corresponding value and repeating"""
  167.     for bit in encodeddata:
  168.         code += bit
  169.         if code in codetable:
  170.             yield codetable[code]
  171.             code = ''
  172.  
  173. def writeencodefile(bytetable, infilename, encodedfilename):
  174.     """Read unencoded data from a file and encode it to another file"""
  175.     with BitFileWriter(encodedfilename, bytetable.itervalues) as encodedfile:
  176.         with open(infilename, 'r+b') as infile:
  177.             with closing(mmap(infile.fileno(), 0)) as indata:
  178.                 for code in encode(bytetable, indata):
  179.                     encodedfile.write(code)
  180.     return bytetable
  181.  
  182. def writedecodefile(codetable, encodedfilename, decodedfilename):
  183.     """Decode an encoded file and save the decoded version"""
  184.     with BitFileReader(encodedfilename) as encodeddata:
  185.         with open(decodedfilename, 'wb') as decodedfile:
  186.             for byte in decode(codetable, encodeddata):
  187.                 decodedfile.write(byte)
  188.     return codetable
  189.  
  190.  
  191.  
  192.  
  193. class Codec(dict):
  194.     """Given raw data, creates a mapping of bytes to codes
  195.       Given a mapping of bytes to codes, Huffman encodes binary data
  196.       Given a mapping of codes to bytes, decodes Huffman encoded binary data
  197.       Includes utility functions for file and dictionary operations"""
  198.     codedata = staticmethod(codedata)
  199.     codefile = staticmethod(codefile)
  200.     loadcodetable = staticmethod(loadcodetable)
  201.     fliptable = fliptable
  202.     keysarebytes = keysarebytes
  203.     dumptable = dumptable
  204.     encode = encode
  205.     decode = decode
  206.     writeencodefile = writeencodefile
  207.     writedecodefile = writedecodefile
  208.  
  209. __all__ = ['Codec', 'BitFileReader', 'BitFileWriter', 'BitReader', 'BitWriter']
  210.  
  211.  
  212.  
  213. def test():
  214.     filename = __file__
  215.     with open(filename, 'rb') as infile:
  216.         indata = infile.read()
  217.  
  218.     # Test Functional
  219.     # Everything is either a generator or returns a Codec
  220.     # (or whatever dict was passed in)
  221.     (codefile(filename).dumptable(filename + '.codetable').
  222.         writeencodefile(filename, filename + '.encoded').
  223.             loadcodetable(filename + '.codetable').fliptable().
  224.                 writedecodefile(filename + '.encoded', filename + '.decoded'))
  225.     with open(filename + '.decoded', 'rb') as outfile:
  226.         assert indata == outfile.read()
  227.  
  228.     # Test Procedural
  229.     # Everything works fine called unbound
  230.     # They'll all take regular dicts too
  231.     writeencodefile(dumptable(codefile(filename), filename + '.codetable'),
  232.                                 filename, filename + '.encoded')
  233.     writedecodefile(fliptable(loadcodetable(filename + '.codetable')),
  234.                                 filename + '.encoded', filename + '.decoded')
  235.     with open(filename + '.decoded', 'rb') as outfile:
  236.         assert indata == outfile.read()
  237.  
  238.     # Test OO
  239.     # You can also call Codec(byte_or_codetable) with a normal dict
  240.     codec = Codec.codefile(filename)
  241.     codec.writeencodefile(filename, filename + '.encoded')
  242.     codec.dumptable(filename + '.codetable')
  243.     codec = Codec.loadcodetable(filename + '.codetable')
  244.     codec.fliptable()
  245.     codec.writedecodefile(filename + '.encoded', filename + '.decoded')
  246.     with open(filename + '.decoded', 'rb') as outfile:
  247.         assert indata == outfile.read()
  248.  
  249.     print filename, "successfully compressed and decompressed."
  250.  
  251. if __name__ == '__main__':
  252.     #__import__('cProfile').run("test()")
  253.     test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement