# Flashback CT Codec

#!/usr/bin/env python

## Flashback CT codec.
## Written using Python 3.4.3 by butchering a different project.
## Not the best, but works.

def check(file):
    """Verify the XOR checksum of a CT stream.

    The last 8 bytes of ``file`` are a 32-bit checksum followed by a
    32-bit size field; everything before them is a sequence of 32-bit
    words.  The checksum is XORed with every one of those words.

    Words are read with an explicit width and byte order instead of
    ``array("L")``, whose item size (4 or 8 bytes) and endianness depend
    on the platform.

    Returns 0 if the checksum is correct.  Otherwise returns the XOR
    difference, with every word interpreted as little-endian.

    Raises ValueError if the data before the 8-byte trailer is not a
    whole number of 32-bit words.
    """
    body = file[:-8]
    if len(body) % 4:
        raise ValueError("CT payload must be a multiple of 4 bytes")
    crc = int.from_bytes(file[-8:-4], 'little')
    # XOR works byte-wise, so any byte order gives 0 for a valid stream as
    # long as the checksum and the payload words use the same one.
    for k in range(0, len(body), 4):
        crc ^= int.from_bytes(body[k:k + 4], 'little')
    return crc

def dec(file):
    """Decompress a CT stream and return the unpacked data as bytes.

    Layout of ``file``: a sequence of 32-bit big-endian words holding the
    bit stream, then a 32-bit checksum (ignored here, see check()), then
    the unpacked size as a 32-bit big-endian integer.

    The bit stream is consumed from the last word toward the first and the
    output buffer is filled from its end toward its start.  Commands, in
    the order their bits are read:

        0 0 nnn                 nnn+1 literal bytes follow (1..8)
        0 1 <8-bit off>         copy 2 bytes from out[pos + off]
        1 00 <9-bit off>        copy 3 bytes from out[pos + off]
        1 01 <10-bit off>       copy 4 bytes from out[pos + off]
        1 10 <8-bit n> <12-bit off>  copy n+1 bytes from out[pos + off]
        1 11 <8-bit n>          n+9 literal bytes follow (9..264)

    Raises ValueError if the bit stream is not a whole number of 32-bit
    words.  If the stream runs out of bits before the output is complete,
    the StopIteration raised by the bit reader propagates to the caller.
    """
    def getBits(s):
        """Coroutine bit reader over the big-endian 32-bit words of ``s``.

        Prime it with next(); afterwards send(n) returns the next n bits as
        an integer whose most significant bit is the first bit read.
        Words are taken last-to-first and each word is consumed from its
        least significant bit upward.
        """
        c, bits, f = 0, 0, 0
        # Read each word with an explicit width and byte order; array("L")
        # is 8 bytes wide on many 64-bit platforms and uses native endianness.
        for k in range(len(s) - 4, -1, -4):
            # The final word of the stream carries its own end marker (the
            # highest set bit).  Every other word is full, so a marker is
            # added at bit 32; a word is exhausted when only the marker is left.
            i = int.from_bytes(s[k:k + 4], 'big') | f
            f = 0x100000000
            while i != 1:
                while not bits:
                    # Hand back the finished value and wait for the next request.
                    bits = yield c
                    c = 0
                bits -= 1
                c = (c << 1) | (i & 1)
                i >>= 1
        # The last request may complete exactly at the end of the stream;
        # its value still has to be delivered.
        if not bits:
            yield c

    payload = file[:-8]
    if len(payload) % 4:
        raise ValueError("CT payload must be a multiple of 4 bytes")

    pos = int.from_bytes(file[-4:], 'big')
    out = bytearray(pos)
    src = getBits(payload)
    next(src)
    pos -= 1

    while pos >= 0:
        if not src.send(1):
            if not src.send(1):
                # 0.0.nnn: short literal run.
                l = src.send(3) + 1
                for j in range(l):
                    out[pos] = src.send(8)
                    pos -= 1
                continue
            # 0.1: two-byte copy with an 8-bit offset.
            l, b = 2, src.send(8)
        else:
            b = src.send(2)
            if b == 3:
                # 1.11: long literal run.
                l = src.send(8) + 9
                for j in range(l):
                    out[pos] = src.send(8)
                    pos -= 1
                continue
            if b == 2:
                # 1.10: copy with explicit length and 12-bit offset.
                l = src.send(8) + 1
                b = src.send(12)
            else:
                # 1.00 / 1.01: 3- or 4-byte copy with 9- or 10-bit offset.
                l = 3 + b
                b = src.send(9 + b)
        # Copy byte by byte so that overlapping source/destination ranges
        # pick up bytes written earlier in this same copy.
        for j in range(l):
            out[pos] = out[pos + b]
            pos -= 1
    return bytes(out)

def cmpBackward(data):
    """Returns a [list] of LZ hits within data.
    This can be converted into binary form via compilecmp().

    The search runs over a reversed copy of data, walking a cursor (pos)
    from the end of that copy toward its start.  Entries are collected in
    that order and the list is reversed before it is returned.
    Entries are tuples in one of these formats:
        ('u', bytes, length)    uncompressed segment
            bytes is a slice of the reversed data to write
            length is the number of bytes in the slice

        ('c', back, length)     compressed segment
            back is computed as pos-b+1 after pos has been moved back by
            length, where b is the recorded hit position
            length is the number of bytes to copy

        ('e',)  end of list

    An empty input returns an empty list (without the ('e',) entry).
    NOTE(review): earlier documentation described a fourth 'cmdlen' field
    on 'c' entries; the code below only ever appends three fields.
    """
    # All searching and slicing below is done on the reversed copy.
    data = bytes(reversed(data))
    pos = len(data)
    tbl = []
    if not pos:
        return tbl
    # u counts bytes waiting to be emitted as the next 'u' entry.
    u = 0
    while pos>0:
        o = 1
        # h is the rfind() index + 1, so 0 means "no earlier occurrence".
        # The search window ends at pos-1.
        h = data.rfind(data[pos-o:pos], 0, max(0,pos-1)) + 1
        if not h:
            u+=1
            pos-=1
        else:
            # b is last good hit, g is last good hit length
            b, g, l, p = 0, -1, 0, 0
            # I'll change this later.  Better method would be to test exponential growth, then test back exponentially until length.
            while (l+o) < (pos-1):
                # e is minimum length for command based on hit
                e = 0
                i = l + o
                j = pos - l - o - h
                # These length/distance limits use the same constants as the
                # copy encodings in compilecmp(); presumably they pick the
                # cheapest command that can express the hit -- TODO confirm.
                if i == 2 and j < 0x100:
                    e = 1
                elif i == 3 and j < 0x200:
                    e = 2
                elif i == 4 and j < 0x400:
                    e = 2
                elif i < 0x101 and j < 0x1000:
                    e = 3
                else:
                    # No command can express this length/distance pair.
                    break
                # If l changed, you'll have to recalc
                if l > (l+o-e):
                    l = l+o-e
                    o = e
                    continue
                # Only set when expressible!
                if l>=0:
                    b, g = h, l
                    p = o if o>e else e
                if e<=o:
                    l+=1
                else:
                    o = e

                # Do the search
                h = data.rfind(data[pos-l-o:pos], 0, max(0,pos-1)) + 1
                if not h:
                    break

            # pos - length is duplicated segment
            l = g+p

##                if g==0 and u:
##                    # If length of cmd same as uncompressed and already have uncompressed, extend it.
##                    u+=1
##                    pos-=1
##                elif l<2 or g<0:
            if l<2 or g<0:
                # Catch unexpressable hits.
                u+=1
                pos-=1
            else:
                if u:
                    # Flush any uncompressed data first.
                    tbl.append(('u', data[pos:pos+u], u))
                    u = 0
                # Hit is start position of segment - hit location (relative)
                pos-=l
                tbl.append(('c', pos-b+1, l))
    # Stream always has to start with uncompressed data.
    # Whatever is left below pos joins the pending literal bytes.
    u += pos
    tbl.append(('u', data[0:u], u))
    # Entries were collected from the end of the reversed data toward its
    # start; flip them before adding the terminator.
    tbl.reverse()
    tbl.append(('e',))
    return tbl

def compilecmp(cmp, org_sz):
    """Pack a command list into a CT byte stream.

    cmp is a list of tuples as produced by cmpBackward():
        ('u', data, length)   literal bytes, in the order the decoder
                              will emit them
        ('c', offset, length) copy `length` bytes from `offset` bytes ahead
                              of the write position
    Entries with any other tag (such as the ('e',) terminator) are skipped.
    org_sz is the unpacked size stored in the stream trailer.

    The decoder reads the bit stream backward, so the list is walked in
    reverse and each command is pushed payload first, opcode last.

    Returns bytes: the packed 32-bit big-endian words, then the XOR of all
    those words, then org_sz.  The payload bit count is printed as a side
    effect.

    Raises ValueError if a copy entry does not fit any encoding, and
    OverflowError if org_sz does not fit in an unsigned 32-bit field.
    """
    words = []  # completed 32-bit words, in file order
    cur = 1     # word being filled; the leading 1 marks how many bits it holds

    def push(val, sz):
        """Append the low sz bits of val, least significant bit first."""
        nonlocal cur
        for _ in range(sz):
            cur = (cur << 1) | (val & 1)
            val >>= 1
            if cur & 0x100000000:
                # 32 bits collected: store the word and restart the marker.
                words.append(cur & 0xFFFFFFFF)
                cur = 1

    count = 0
    for i in reversed(cmp):
        tag = i[0]
        # Compare tags by value; identity comparison of strings only works
        # by accident of interning.
        if tag == 'u':
            s, l = i[1], i[2]
            p = -1
            count += l << 3
            while l > 0:
                if l <= 8:  # 0.0.xxx       write 1 + getBits(3) bytes from source  [5 + 8c]
                    # The 3-bit form covers 1..8 bytes.  A run of exactly 8
                    # must use it: the long form would encode 8 - 9 = -1.
                    n = l
                    for k in range(n):
                        push(s[p], 8)
                        p -= 1
                    push(n - 1, 3)
                    push(0, 2)
                    count += 5
                else:       # 1.11.xxxxxxxx write 9 + getBits(8) bytes from source  [11 + 8c]
                    # The 8-bit length field allows at most 9 + 255 = 0x108 bytes.
                    n = min(l, 0x108)
                    for k in range(n):
                        push(s[p], 8)
                        p -= 1
                    push(n - 9, 8)
                    push(7, 3)
                    count += 11
                l -= n
        elif tag == 'c':
            l, s = i[1], i[2]   # l is the offset, s is the copy length
            if s == 2 and l < 0x100:    # 0.1.xxxxxxxx  copy 2 bytes from out+getBits(8)    [10]
                push(l, 8)
                push(1, 2)
                count += 10
            elif s == 3 and l < 0x200:  # 1.00.xxxxxxxxx    copy 3 bytes from out+getBits(9)    [12]
                push(l, 9)
                push(4, 3)
                count += 12
            elif s == 4 and l < 0x400:  # 1.01.xxxxxxxxxx   copy 4 bytes from out+getBits(10)   [13]
                push(l, 10)
                push(5, 3)
                count += 13
            elif s <= 0x100 and l < 0x1000: # 1.10.xxxxxxxx.xxxxxxxxxxxx    copy getBits(8)+1 bytes from out+getBits(12)    [23]
                push(l, 12)
                push(s - 1, 8)
                push(6, 3)
                count += 23
            else:
                raise ValueError("Unable to encode copy of 0x%X bytes from 0x%X." % (s, l))
    print(count)

    # Store the partially filled last word (marker included), then the
    # XOR checksum of every word, then the unpacked size.
    words.append(cur)
    crc = 0
    for w in words:
        crc ^= w
    words.append(crc)
    words.append(org_sz)
    # Explicit 4-byte big-endian fields: array("L") would emit 8-byte items
    # on platforms where a C long is 64 bits.
    return b"".join(w.to_bytes(4, 'big') for w in words)

def main():
    """Round-trip LEVEL1.UCT through the codec.

    Reads LEVEL1.UCT from the current directory, prints the command list
    from cmpBackward(), writes the packed stream to test.CT, unpacks it
    again, prints whether the result equals the input, and writes the
    unpacked data to test.UCT.
    """
    with open(r"LEVEL1.UCT", 'rb') as f:
        org = f.read()
    cmd = cmpBackward(org)
    print(cmd)
    out = compilecmp(cmd, len(org))
    with open(r"test.CT", 'wb') as f:
        f.write(out)

    # Unpack once and reuse the result for both the comparison and the file.
    test = dec(out)
    print(test == org)
    with open(r"test.UCT", 'wb') as f:
        f.write(test)


# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()