# DQ4 script dumper

#!/usr/bin/python
# -*- coding: utf_8 -*-

from __future__ import print_function
import collections, re, struct, sys, unicodedata


def nextmethod(i):
    """Return the bound "advance" method of a fresh iterator over i.

    Calling the returned method yields the next item of i, and raises
    StopIteration once i is exhausted. This makes it suitable for use
    with the two-argument form iter(callable, sentinel).
    """
    iterator = iter(i)
    # python2 iterators spell the method next(); python3 renamed it __next__()
    return iterator.next if str is bytes else iterator.__next__


def rompos(bank, address):
    """Translate a (bank, CPU address) pair into an absolute ROM offset.

    Arguments:
    bank:    16KB ROM bank number
    address: address in CPU address space; expected to lie in the
             0x8000-0xbfff window (this is not checked)
    """
    bank_size = 0x4000      # 16KB per bank
    window_base = 0x8000    # CPU address corresponding to offset 0 of a bank
    return bank * bank_size + address - window_base


def pointers(data, address, count):
    """Read a table of 16-bit little-endian pointers out of the ROM.

    Arguments:
    data:    the ROM data
    address: start of the pointer table in CPU address space; the table is
             assumed to live in bank 0x16
    count:   number of pointers to extract

    Returns:
    A tuple of count unsigned 16-bit integers.
    """
    layout = "<%dH" % count             # count little-endian unsigned shorts
    table_offset = rompos(0x16, address)
    return struct.unpack_from(layout, data, table_offset)


try:
    # python3: the ABCs live in collections.abc; the old collections.Iterator
    # alias was removed in python 3.10
    from collections.abc import Iterator as _Iterator
except ImportError:
    # python2 has no collections.abc module
    from collections import Iterator as _Iterator


class Octets(_Iterator):
    """Iterator that yields octets from the ROM starting at the given
    bank and address, straddling banks as necessary.

    Constructor arguments:
    data: the ROM data (indexable by absolute ROM position)
    bank: the logical ROM bank (0-5) 5 is translated to physical bank 0x1b
    ptr:  start of stream in CPU address space

    Public attributes:
    pos:  absolute ROM position of last byte yielded, or start position if
          no bytes yielded yet. This attribute is read-only.

    Note that ptr is range-checked lazily: an out-of-range pointer raises
    ValueError on the first call to next(), not in the constructor.
    """
    def __init__(self, data, bank, ptr):
        self._data = data
        self._offsets = self._offsetgen(bank, ptr)
        self.pos = rompos(0x1b if bank == 5 else bank, ptr)

    def __next__(self):
        # record the position first so that pos always names the byte returned
        self.pos = next(self._offsets)
        return self._data[self.pos]
    next = __next__ # for python2

    @staticmethod
    def _offsetgen(bank, ptr):
        """Generate absolute ROM positions starting at ptr in logical bank
        bank, then continuing through every following logical bank.

        Raises ValueError (on first iteration) if ptr lies outside the
        readable region of the starting bank.
        """
        # (physical bank, first readable CPU address) for logical banks 0-5
        banks = ((0,    0x8000),
                 (1,    0x8000),
                 (2,    0x8000),
                 (3,    0x8000),
                 (4,    0x8000),
                 (0x1b, 0xa500))
        # exclusive upper CPU address of the readable region in every bank
        end = 0xbfd8

        b, start = banks[bank]
        if not (start <= ptr < end):
            raise ValueError("WTF? Pointer out of range")

        # remainder of the starting bank
        for i in range(rompos(b, ptr), rompos(b, end)):
            yield i

        # then each following bank from its own start address
        for b, start in banks[bank+1:]:
            for i in range(rompos(b, start), rompos(b, end)):
                yield i


def sextets(octet):
    """Repack a stream of 8-bit values into a stream of 6-bit values.

    Bits are taken most significant first, so every three octets produce
    four sextets:

    octets:   AAAAAAAA BBBBBBBB CCCCCCCC
    sextets:  AAAAAA AABBBB BBBBCC CCCCCC

    An octet is pulled from the source only when the next sextet cannot be
    produced without it. If the source ends in the middle of a group, only
    the complete sextets are produced.

    Arguments:
    octet: any iterable that yields bytes (integers 0-255)
    """
    buffered = 0    # bits received but not yet emitted
    width = 0       # how many low bits of buffered are still pending
    for value in octet:
        buffered = buffered << 8 | value
        width += 8
        while width >= 6:
            width -= 6
            yield buffered >> width & 0x3f
        # forget the bits that have already been emitted
        buffered &= (1 << width) - 1


def expander(lut, substring):
    """Return a generator that expands sextets via the LUT and substring dictionary.
    Arguments:
    lut:       128-entry 6-bit to 8-bit lookup table (two 64-entry halves)
    substring: 192-entry substring dictionary
    Returns:
    A generator function that takes a sequence of sextets and yields expanded
    bytes for exactly one string. It stops reading right after the 0x39
    end-of-string sextet, so the same sextet iterator can be passed again to
    expand the following string.
    Raises (from the generator):
    RuntimeError if the stream ends between a substring prefix (0x3d-0x3f)
    and its index sextet.
    """
    def generator(sextet):
        stream = iter(sextet)
        kanatype = 0                   # offset of the active half of the LUT
        for byte in stream:
            if byte == 0x39:           # 0x39: end of string
                break
            if byte < 0x3c:            # 0-0x3b: single character from table
                # NOTE: a permanently disabled branch (`if 0:`) that would
                # have expanded LUT value 0xf0 into substring[0] was removed
                yield lut[kanatype + byte]
            elif byte == 0x3c:         # 0x3c: switch tables
                kanatype ^= 0x40
            else:                      # 0x3d-0x3f: substring
                # the prefix supplies the top two bits of the index and
                # the next sextet the low six; that sextet is data, so a
                # value of 0x39 here is not a terminator
                try:
                    low = next(stream)
                except StopIteration:
                    raise RuntimeError("WTF? Unexpected end of sextet stream")
                for i in substring[(byte - 0x3d) << 6 | low]:
                    yield i

        # the end marker is itself part of the output, so expand it too;
        # this also happens when the stream runs dry without a 0x39
        yield lut[kanatype + 0x39]
    return generator


def decode(expanded):
    """Return a string decoded from a sequence of expanded bytes.

    Bytes below len(charset) map to characters; anything else (control
    codes, etc.) is dumped as <XX> hex. Bytes 0xfb and 0xfc are additionally
    followed by a newline. Bytes 0x3d/0x3e are voiced/semi-voiced sound marks
    that precede the character they modify; they are moved behind it and the
    result is NFC-normalized so that the pair becomes a single character.

    A sound mark followed by a byte that is not in the charset has nothing
    to attach to: it is dumped as hex and the following byte is then handled
    like any other byte. A sound mark that is the very last byte is dropped.

    Arguments:
    expanded: any iterable that yields bytes (integers)
    """
    # U+3099 COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
    # U+309A COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
    charset = (u"　０１２３４５６７８９あいうえお"
               u"かきくけこさしすせそたちつてとな"
               u"にぬねのはひふへほまみむめもやゆ"
               u"よらりるれろわをんっゃゅょ\u3099\u309a。"
               u"アイエオカキクコサシスソタテトナ"
               u"ニヌネノハヒフホマミムメモラルレ"
               u"ロンャッドーじだどぶ゜゛．？！「"
               u"＊：…十骨ＭＰＸＡＢＣＤＥＦＧＨ")
    maxc = len(charset)

    ret = []
    append = ret.append
    expanded = iter(expanded)
    for byte in expanded:
        if byte in (0x3d, 0x3e):
            # move dakuten/handakuten after the next character
            try:
                following = next(expanded)
            except StopIteration:
                break
            if following < maxc:
                append(charset[following])
                append(charset[byte])
                continue
            # the next byte is not a character: emitting the combining mark
            # would attach it to whatever was output before, so dump it as
            # hex and fall through to handle the following byte normally
            append(u"<%02X>" % byte)
            byte = following
        # dump nonprintable characters (control codes, etc.) as hex
        append(charset[byte] if byte < maxc else u"<%02X>" % byte)
        if byte in (0xfb, 0xfc):
            append(u"\n")
    return unicodedata.normalize("NFC", u"".join(ret))


def prettify():
    """Build and return the text clean-up function.

    The returned function takes a decoded string and returns it with
    hiragana へべぺり replaced by their katakana look-alikes where they
    evidently belong to a katakana word, plus a few small-kana fixes.
    """
    # places that need fixing:
    #
    # katakana followed by べぺり (but not へ, since it may be a particle)
    # へべぺり followed by katakana or ー (this catches ヘ in the middle of a word)
    # フ followed by ア or エ (make ファ or フェ with small katakana)
    # リリパット (monster name, the rules above don't catch the first リ)
    # イエティ, ガーディアン (monster names wanting small ィ)
    pattern = re.compile(u"[ア-ン][べぺり]|[へべぺり][ア-ー]|フ[アエ]|りりパ|エテイ|デイア")

    # translation table for unicode.translate()/str.translate():
    # code point of each hiragana -> code point of the matching katakana
    to_katakana = {}
    for hira, kata in zip(u"へべぺり", u"ヘベペリ"):
        to_katakana[ord(hira)] = ord(kata)

    # matches with a fixed replacement; these take priority over translation
    overrides = {
        u"へー": u"へー",     # leave へ alone in でへへー, etc.
        u"りア": u"りア",     # leave り alone in さそりアーマー
        u"エテイ": u"エティ",
        u"デイア": u"ディア",
        u"フア": u"ファ",
        u"フエ": u"フェ",
    }

    def replacement(match):
        # callback for re.sub(): pick the override if there is one,
        # otherwise convert the look-alike hiragana in the match
        found = match.group(0)
        if found in overrides:
            return overrides[found]
        return found.translate(to_katakana)

    def apply(text):
        return pattern.sub(replacement, text)

    # hand back a closure so all of the above setup is only done once
    return apply
prettify = prettify()


def main():
    """Dump every script string of the ROM named on the command line.

    For each string, a header line with the string number and the range of
    absolute ROM positions it was read from is printed, followed by the
    decoded text. Exits with a usage message if no filename was given.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: dq4text filename")
    path = sys.argv[1]

    # load the ROM image, skipping the 16-byte .nes header so that
    # positions in data are absolute ROM positions
    with open(path, "rb") as rom:
        rom.seek(0x10)
        data = bytearray(rom.read())

    # 6-bit to 8-bit lookup tables (64 hiragana + 64 katakana)
    lut_start = rompos(0x16, 0x8765)
    lut = data[lut_start:lut_start + 128]

    # 192-entry substring dictionary
    # substrings are plain byte strings (not sextet-packed) and end with 0xfe
    # like the main script, there is one pointer per 32 substrings
    substring = []
    for ptr in pointers(data, 0x87e5, 6):
        read_octet = nextmethod(Octets(data, 5, ptr))
        for _ in range(32):
            # iter(callable, sentinel) reads up to, and swallows, the 0xfe
            substring.append(bytearray(iter(read_octet, 0xfe)))

    # sextet decoder
    expand = expander(lut, substring)

    # bank bins: the pointer indexes at which the logical bank advances
    bins_start = rompos(0x16, 0x8960)
    bankbins = data[bins_start:bins_start + 5]

    # there are a few "holes" in the script
    # (pointers with fewer than 32 valid strings after them)
    holes = {4: 11, 15: 27, 87: 3}

    seen = set()
    bank = 0
    for index, ptr in enumerate(pointers(data, 0x88b0, 0x58)):
        if index in bankbins:
            bank += 1

        # a (bank, pointer) pair that was already dumped is skipped
        key = (bank, ptr)
        if key in seen:
            continue
        seen.add(key)

        octet = Octets(data, bank, ptr)
        sextet = sextets(octet)

        first = index * 32
        start = octet.pos
        for number in range(first, first + holes.get(index, 32)):
            # decoding must finish before octet.pos is read: pos only
            # advances as the string is pulled through the generators
            text = prettify(decode(expand(sextet)))
            end = octet.pos
            print("$%03X ($%05X-$%05X):" % (number, start, end))
            print(text)
            start = end


# Run the dumper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()