Marrin

BASIC (De)Tokenizer

Aug 29th, 2013
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""CBM BASIC V2 (de)tokenization.

:todo: Repeat syntax in escape codes like ``{017*3}``.
:todo: Unicode translation of escape codes and characters to get sources
   looking more like the original.  ``£`` instead of a backslash,
   ``←`` instead of ``_`` and so on.
:todo: Choose between lower/upper case character sets.
:todo: Allow keyword escape sequences like ``{print}``.
:todo: Option to list keywords as escape sequences.
:todo: Documentation.
:todo: Let user set start address.
:todo: Factor out escape sequence delimiters as constants.
:todo: Options to list keywords and escape sequence names.
:todo: Warn if line numbers are not in ascending order.
"""
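#
# A minimal usage sketch (the ``program.prg`` file name is just an example):
# ``tokenize()`` turns an iterable of source lines into the bytes of a PRG
# file, ``detokenize()`` yields ``(line_number, text)`` pairs from such bytes.
#
#     with open('program.prg', 'rb') as prg_file:
#         for number, text in detokenize(prg_file.read()):
#             print '%5d %s' % (number, text)
#
#     prg_bytes = tokenize(['10 print"hello"', '20 goto 10'])
#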
import logging
import os
import re
import sys
from functools import partial
from itertools import imap
from optparse import OptionParser


__all__ = ['detokenize', 'tokenize']
__author__ = "Marc 'BlackJack' Rintsch (marc[at]rintsch[dot]de)"
__version__ = '0.0.0'

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(os.path.basename(__file__))

#:
#: Token value of a keyword in the list is the keyword's index + 128.
#:
BASIC_V2_TOKENS = [
    'end', 'for', 'next', 'data', 'input#', 'input', 'dim', 'read', 'let',
    'goto', 'run', 'if', 'restore', 'gosub', 'return', 'rem', 'stop', 'on',
    'wait', 'load', 'save', 'verify', 'def', 'poke', 'print#', 'print', 'cont',
    'list', 'clr', 'cmd', 'sys', 'open', 'close', 'get', 'new', 'tab(', 'to',
    'fn', 'spc(', 'then', 'not', 'step', '+', '-', '*', '/', '^', 'and', 'or',
    '>', '=', '<', 'sgn', 'int', 'abs', 'usr', 'fre', 'pos', 'sqr', 'rnd',
    'log', 'exp', 'cos', 'sin', 'tan', 'atn', 'peek', 'len', 'str$', 'val',
    'asc', 'chr$', 'left$', 'right$', 'mid$', 'go'
]
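#
# For example 'end' is at index 0 and therefore tokenizes to chr(128) (0x80),
# while 'print' is at index 25 and tokenizes to chr(153) (0x99).
#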
#:
#: Mapping of keyword to token value.
#:
KEYWORD2VALUE = dict(
    (t, chr(v)) for v, t in enumerate(BASIC_V2_TOKENS, 128)
)
#:
#: Regular expression for tokenizing the part after the line number.
#:
#: Matches all keywords, strings, and escape sequences outside strings.
#:
TOKENIZE_RE = re.compile(
    '|'.join(
        re.escape(k) for k in sorted(BASIC_V2_TOKENS, key=len, reverse=True)
    )
    + r'|"[^"]*"|\{[^\}]*\}'
)
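#
# Keywords are sorted longest first so e.g. 'input#' wins over 'input' and
# 'goto' over 'go'.  On the line ``print"hi":goto10`` the expression matches
# 'print', '"hi"' and 'goto'; the remaining ':' and '10' fall through to the
# literal handler of the tokenizer.
#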
#:
#: Regular expression to match line numbers at the beginning of a line.
#:
LINE_NUMBER_RE = re.compile(r'\s*\d+')

ESCAPE_NAMES = [
    (
        0,
        [
            'null', 'ctrl a', 'ctrl b', 'ctrl c', 'ctrl d', 'white', 'ctrl f',
            'ctrl g', 'up/lo lock on', 'up/lo lock off', 'ctrl j', 'ctrl k',
            'ctrl l', 'return', 'lower case', 'ctrl o', 'ctrl p', 'down',
            'reverse on', 'home', 'delete', 'ctrl u', 'ctrl v', 'ctrl w',
            'ctrl x', 'ctrl y', 'ctrl z', 'esc', 'red', 'right', 'green', 'blue'
        ]
    ),
    (92, 'pound'),
    (95, 'arrow left'),
    (129, 'orange'),
    (
        133,
        [
            'f1', 'f3', 'f5', 'f7', 'f2', 'f4', 'f6', 'f8', 'shift return',
            'upper case'
        ]
    ),
    (
        144,
        [
            'black', 'up', 'reverse off', 'clear', 'insert', 'brown',
            'light red', 'dark gray', 'gray', 'light green', 'light blue',
            'light gray', 'purple', 'left', 'yellow', 'cyan', 'shift space',
            'cbm k', 'cbm i', 'cbm t', 'cbm @', 'cbm g', 'cbm +', 'cbm m',
            'cbm pound', 'shift pound', 'cbm n', 'cbm q', 'cbm d', 'cbm z',
            'cbm s', 'cbm p', 'cbm a', 'cbm e', 'cbm r', 'cbm w', 'cbm h',
            'cbm j', 'cbm l', 'cbm y', 'cbm u', 'cbm o', 'shift @', 'cbm f',
            'cbm c', 'cbm x', 'cbm v', 'cbm b', 'shift asterisk'
        ]
    ),
    (219, ['shift +', 'cbm -', 'shift -']),
    (223, 'cbm asterisk'),
    (255, 'pi'),
]

def _prepare_escape_name_mapping():
    result = dict()
    for value, names in ESCAPE_NAMES:
        if isinstance(names, basestring):
            names = [names]
        result.update((n, chr(v)) for v, n in enumerate(names, value))
    return result

ESCAPE_NAME_TO_VALUE = _prepare_escape_name_mapping()

VALUE_TO_UNICODE = {
    '\x5c': u'£',
    '\x5e': u'↑',
    '\x5f': u'←',
    '\xff': u'π',
}

def _prepare_unicode_escape_mapping():
    value2escape = dict((v, e) for e, v in ESCAPE_NAME_TO_VALUE.iteritems())
    result = dict()
    for value, unichar in VALUE_TO_UNICODE.iteritems():
        escape = value2escape.get(value)
        result[ord(unichar)] = u'{%s}' % escape if escape else unicode(value)
    return result

UNICODE_TO_ESCAPE = _prepare_unicode_escape_mapping()


class Error(Exception):
    pass


class TokenizeError(Error):
    pass


class Detokenizer(object):
    def __init__(self):
        self.is_in_string_mode = False
        #
        # Base mapping: Byte value to numeric escape sequence.
        #
        self.string_mapping = dict((chr(i), '{%03d}' % i) for i in xrange(256))
        #
        # Range 32--64 is a 1:1 mapping.
        #
        self.add_range(32, 64, 32)
        #
        # Unshifted characters are mapped to lower case ASCII characters.
        #
        self.add_range(65, 90, 97)
        #
        # Shifted characters are mapped to upper case ASCII characters.
        #
        self.add_range(97, 127, 65)
        #
        # Values above 191 are copies from lower ranges.
        #
        self.copy_range(192, 233, 96)
        self.copy_range(224, 254, 160)
        self.copy_range(255, 255, 126)
        #
        # Add symbolic names.
        #
        for name, value in ESCAPE_NAME_TO_VALUE.iteritems():
            self.string_mapping[value] = '{%s}' % name

        self.code_mapping = dict(
            (chr(i), kw) for i, kw in enumerate(BASIC_V2_TOKENS, 128)
        )
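        #
        # A few illustrative entries of the resulting string mapping (derived
        # from the ranges above): chr(65) -> 'a' and chr(97) -> 'A' (the
        # PETSCII case swap), chr(147) -> '{clear}' via its symbolic name, and
        # values without a mapping, e.g. chr(128), stay numeric as '{128}'.
        #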

    def add_range(self, start, end, value_start):
        offset = value_start - start
        self.string_mapping.update(
            (chr(i), chr(i + offset)) for i in xrange(start, end + 1)
        )

    def copy_range(self, to_start, to_end, from_start):
        offset = from_start - to_start
        self.string_mapping.update(
            (chr(i), self.string_mapping[chr(i + offset)])
            for i in xrange(to_start, to_end + 1)
        )

    def decode(self, value):
        if value == '"':
            self.is_in_string_mode = not self.is_in_string_mode
        if not self.is_in_string_mode:
            try:
                return self.code_mapping[value]
            except KeyError:
                pass    # Intentionally ignored.
        return self.string_mapping[value]

    def detokenize_line(self, line):
        self.is_in_string_mode = False
        return ''.join(imap(self.decode, line))

    def detokenize(self, basic_prg, offset=2):
        i = offset
        while get_uint16(basic_prg, i) != 0:
            i += 2  # Skip pointer to next BASIC line.
            line_number = get_uint16(basic_prg, i)
            i += 2  # Skip line number.
            j = basic_prg.index('\0', i)
            tokenized_line = basic_prg[i:j]
            yield (line_number, self.detokenize_line(tokenized_line))
            i = j + 1   # Skip to next line.


def get_uint16(data, offset):
    return ord(data[offset]) | ord(data[offset + 1]) << 8


def int2uint16(value):
    if not (0 <= value < 2**16):
        raise ValueError('integer not in range 0..65535')
    high, low = divmod(value, 256)
    return chr(low) + chr(high)
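#
# Both helpers use the 6502's little endian byte order: for example the
# default BASIC start address 0x0801 round-trips as
# int2uint16(0x0801) == '\x01\x08' and get_uint16('\x01\x08', 0) == 0x0801.
#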


def detokenize(basic_prg, offset=2):
    return Detokenizer().detokenize(basic_prg, offset)


def tokenize_escape(name):
    try:
        return ESCAPE_NAME_TO_VALUE[name]
    except KeyError:
        return chr(int(name))


def tokenize_raw_string(string):
    result = list()
    for value in string:
        tmp = ord(value)
        if tmp == 126:
            value = '\xff'
        elif 'a' <= value <= 'z':
            value = value.upper()
        elif 'A' <= value <= 'Z':
            value = chr(ord(value.lower()) + 96)
        result.append(value)
    return ''.join(result)


def _tokenize(regex, process_match, process_rest, string):
    result = list()
    i = j = 0
    for match in regex.finditer(string):
        j = match.start()
        if i != j:
            result.append(process_rest(string[i:j]))
        i = match.end()
        data, finished = process_match(match)
        result.append(data)
        if finished:
            i = len(string)
            break
    if i != len(string):
        result.append(process_rest(string[i:]))
    return ''.join(result)


tokenize_string = partial(
    _tokenize,
    re.compile(r'\{[^\}]*\}'),
    lambda m: (tokenize_escape(m.group()[1:-1]), False),
    tokenize_raw_string
)


def find_command_separator(line, offset):
    for match in re.compile(r'"[^"]*?"|:').finditer(line, offset):
        if match.group() == ':':
            return match.end()
    return len(line)


def tokenize_line_content(line):
    def process_match(match):
        result = list()
        data = match.group()
        try:
            result.append(KEYWORD2VALUE[data])
        except KeyError:
            if data.startswith('"'):
                result.append(tokenize_string(data))
            elif data.startswith('{'):
                result.append(tokenize_escape(data[1:-1]))
            else:
                assert False
        else:
            if data == 'rem':
                result.append(tokenize_string(line[match.end():]))
            elif data == 'data':
                next_command_offset = find_command_separator(line, match.end())
                result.append(
                    tokenize_string(line[match.end():next_command_offset])
                )
                result.append(tokenize_line_content(line[next_command_offset:]))
        return (''.join(result), data in ['rem', 'data'])

    return _tokenize(TOKENIZE_RE, process_match, lambda s: s.upper(), line)
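#
# Example of the special cases above: in ``10 rem more`` everything after
# 'rem' is stored literally (otherwise the 'or' inside 'more' would become a
# token), and in ``20 data abc:goto 10`` only the text up to the ':' is kept
# literal while the rest of the line is tokenized as usual.
#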


def tokenize_line(line):
    line_number_match = LINE_NUMBER_RE.match(line)
    if not line_number_match:
        raise TokenizeError('no line number found: %r' % line)
    line_number = int(line_number_match.group())
    if line_number >= 2**16:
        raise TokenizeError('line number too high (%d)' % line_number)
    if line_number >= 64000:
        LOG.warn('line number %d >= 64000', line_number)
    line = line[line_number_match.end():].lstrip().rstrip('\r\n')
    if not line.strip():
        LOG.warn('empty line (%d)', line_number)
    return int2uint16(line_number) + tokenize_line_content(line)


def tokenize(lines, address=0x0801):
    result = [int2uint16(address)]
    for line in lines:
        if line.strip():
            tokenized_line = tokenize_line(line)
            address += len(tokenized_line) + 3
            result.append(int2uint16(address) + tokenized_line + '\0')
    result.append(int2uint16(0))
    return ''.join(result)
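#
# The resulting PRG layout, e.g. for ``tokenize(['10 print"hi"'])`` with the
# default start address 0x0801: a 2 byte load address, then per line a 2 byte
# pointer to the next line, the 2 byte line number, the tokenized content and
# a terminating null byte, and finally a 0x0000 pointer marking the end of
# the program.
#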


def ascii_to_utf8(string):
    def replace_func(match):
        escape = match.group()
        return VALUE_TO_UNICODE.get(tokenize_escape(escape[1:-1]), escape)
    return re.sub(r'\{[^}]+\}', replace_func, string).encode('utf-8')


def utf8_to_ascii(string):
    return string.decode('utf-8').translate(UNICODE_TO_ESCAPE).encode('ascii')


def main():
    parser = OptionParser(
        usage='%prog [options] detokenize|tokenize',
        version=__version__,
        description='CBM BASIC V2 (de)tokenizer.',
        epilog='Written by %s.' % __author__
    )
    parser.add_option('-i', '--input', metavar='FILE', action='store',
        default=None, help='input filename (default: <stdin>)')
    parser.add_option('-o', '--output', metavar='FILE', action='store',
        default=None, help='output filename (default: <stdout>)')
    parser.add_option('--unicode', action='store_true', default=False,
        help='convert certain characters and escape sequences to/from unicode.'
            ' The encoding is UTF-8.')
    parser.add_option('--debug', action='store_true', default=False,
        help='activate debugging output like complete stack traces')

    options, arguments = parser.parse_args()

    if options.debug:
        LOG.setLevel(logging.DEBUG)

    if not arguments or arguments[0] not in ['detokenize', 'tokenize']:
        parser.error('please choose operation mode')

    try:
        in_file = open(options.input, 'rb') if options.input else sys.stdin
        out_file = open(options.output, 'wb') if options.output else sys.stdout

        command = arguments[0]
        if command == 'detokenize':
            for line_number, line in detokenize(in_file.read()):
                if options.unicode:
                    line = ascii_to_utf8(line)
                out_file.write('%5d %s\n' % (line_number, line))
        elif command == 'tokenize':
            if options.unicode:
                in_file = imap(utf8_to_ascii, in_file)
            out_file.write(tokenize(in_file))
        else:
            assert False, 'command %r should not be possible' % command
    except Exception, error:
        if options.debug:
            raise
        else:
            parser.error(error)


if __name__ == '__main__':
    main()