Marrin

BASIC (De)Tokenizer

Aug 29th, 2013
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""CBM BASIC V2 (de)tokenization.

:todo: Repeat syntax in escape codes like ``{017*3}``.
:todo: Unicode translation of escape codes and characters to get sources
   looking more like the original.  ``£`` instead of a backslash,
   ``←`` instead of ``_`` and so on.
:todo: Choose between lower/upper case character sets.
:todo: Allow keyword escape sequences like ``{print}``.
:todo: Option to list keywords as escape sequences.
:todo: Documentation.
:todo: Let user set start address.
:todo: Factor out escape sequence delimiters as constants.
:todo: Options to list keywords and escape sequence names.
:todo: Warn if line numbers are not in ascending order.
"""
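#
# A minimal usage sketch (the ``program.prg`` file name is just an example):
# ``tokenize()`` turns an iterable of source lines into the bytes of a PRG
# file, ``detokenize()`` yields ``(line_number, text)`` pairs from such bytes.
#
#     with open('program.prg', 'rb') as prg_file:
#         for number, text in detokenize(prg_file.read()):
#             print '%5d %s' % (number, text)
#
#     prg_bytes = tokenize(['10 print"hello"', '20 goto 10'])
#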
import logging
import os
import re
import sys
from functools import partial
from itertools import imap
from optparse import OptionParser


__all__ = ['detokenize', 'tokenize']
__author__ = "Marc 'BlackJack' Rintsch (marc[at]rintsch[dot]de)"
__version__ = '0.0.0'

logging.basicConfig(level=logging.INFO)
LOG = logging.getLogger(os.path.basename(__file__))

#:
#: Token value of a keyword in the list is the keyword's index + 128.
#:
BASIC_V2_TOKENS = [
    'end', 'for', 'next', 'data', 'input#', 'input', 'dim', 'read', 'let',
    'goto', 'run', 'if', 'restore', 'gosub', 'return', 'rem', 'stop', 'on',
    'wait', 'load', 'save', 'verify', 'def', 'poke', 'print#', 'print', 'cont',
    'list', 'clr', 'cmd', 'sys', 'open', 'close', 'get', 'new', 'tab(', 'to',
    'fn', 'spc(', 'then', 'not', 'step', '+', '-', '*', '/', '^', 'and', 'or',
    '>', '=', '<', 'sgn', 'int', 'abs', 'usr', 'fre', 'pos', 'sqr', 'rnd',
    'log', 'exp', 'cos', 'sin', 'tan', 'atn', 'peek', 'len', 'str$', 'val',
    'asc', 'chr$', 'left$', 'right$', 'mid$', 'go'
]
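#
# For example 'end' is at index 0 and therefore tokenizes to chr(128) (0x80),
# while 'print' is at index 25 and tokenizes to chr(153) (0x99).
#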
#:
#: Mapping of keyword to token value.
#:
KEYWORD2VALUE = dict(
    (t, chr(v)) for v, t in enumerate(BASIC_V2_TOKENS, 128)
)
#:
#: Regular expression for tokenizing the part after the line number.
#:
#: Matches all keywords, strings, and escape sequences outside strings.
#:
TOKENIZE_RE = re.compile(
    '|'.join(
        re.escape(k) for k in sorted(BASIC_V2_TOKENS, key=len, reverse=True)
    )
    + r'|"[^"]*"|\{[^\}]*\}'
)
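#
# Keywords are sorted longest first so e.g. 'input#' wins over 'input' and
# 'goto' over 'go'.  On the line ``print"hi":goto10`` the expression matches
# 'print', '"hi"' and 'goto'; the remaining ':' and '10' fall through to the
# literal handler of the tokenizer.
#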
#:
#: Regular expression to match line numbers at the beginning of a line.
#:
LINE_NUMBER_RE = re.compile(r'\s*\d+')

ESCAPE_NAMES = [
    (
        0,
        [
            'null', 'ctrl a', 'ctrl b', 'ctrl c', 'ctrl d', 'white', 'ctrl f',
            'ctrl g', 'up/lo lock on', 'up/lo lock off', 'ctrl j', 'ctrl k',
            'ctrl l', 'return', 'lower case', 'ctrl o', 'ctrl p', 'down',
            'reverse on', 'home', 'delete', 'ctrl u', 'ctrl v', 'ctrl w',
            'ctrl x', 'ctrl y', 'ctrl z', 'esc', 'red', 'right', 'green', 'blue'
        ]
    ),
    (92, 'pound'),
    (95, 'arrow left'),
    (129, 'orange'),
    (
        133,
        [
            'f1', 'f3', 'f5', 'f7', 'f2', 'f4', 'f6', 'f8', 'shift return',
            'upper case'
        ]
    ),
    (
        144,
        [
            'black', 'up', 'reverse off', 'clear', 'insert', 'brown',
            'light red', 'dark gray', 'gray', 'light green', 'light blue',
            'light gray', 'purple', 'left', 'yellow', 'cyan', 'shift space',
            'cbm k', 'cbm i', 'cbm t', 'cbm @', 'cbm g', 'cbm +', 'cbm m',
            'cbm pound', 'shift pound', 'cbm n', 'cbm q', 'cbm d', 'cbm z',
            'cbm s', 'cbm p', 'cbm a', 'cbm e', 'cbm r', 'cbm w', 'cbm h',
            'cbm j', 'cbm l', 'cbm y', 'cbm u', 'cbm o', 'shift @', 'cbm f',
            'cbm c', 'cbm x', 'cbm v', 'cbm b', 'shift asterisk'
        ]
    ),
    (219, ['shift +', 'cbm -', 'shift -']),
    (223, 'cbm asterisk'),
    (255, 'pi'),
]

def _prepare_escape_name_mapping():
    result = dict()
    for value, names in ESCAPE_NAMES:
        if isinstance(names, basestring):
            names = [names]
        result.update((n, chr(v)) for v, n in enumerate(names, value))
    return result

ESCAPE_NAME_TO_VALUE = _prepare_escape_name_mapping()

VALUE_TO_UNICODE = {
    '\x5c': u'£',
    '\x5e': u'↑',
    '\x5f': u'←',
    '\xff': u'π',
}

def _prepare_unicode_escape_mapping():
    value2escape = dict((v, e) for e, v in ESCAPE_NAME_TO_VALUE.iteritems())
    result = dict()
    for value, unichar in VALUE_TO_UNICODE.iteritems():
        escape = value2escape.get(value)
        result[ord(unichar)] = u'{%s}' % escape if escape else unicode(value)
    return result

UNICODE_TO_ESCAPE = _prepare_unicode_escape_mapping()


class Error(Exception):
    pass


class TokenizeError(Error):
    pass


class Detokenizer(object):
    def __init__(self):
        self.is_in_string_mode = False
        #
        # Base mapping: Byte value to numeric escape sequence.
        #
        self.string_mapping = dict((chr(i), '{%03d}' % i) for i in xrange(256))
        #
        # Range 32--64 is a 1:1 mapping.
        #
        self.add_range(32, 64, 32)
        #
        # Unshifted characters are mapped to lower case ASCII characters.
        #
        self.add_range(65, 90, 97)
        #
        # Shifted characters are mapped to upper case ASCII characters.
        #
        self.add_range(97, 127, 65)
        #
        # Values above 191 are copies from lower ranges.
        #
        self.copy_range(192, 233, 96)
        self.copy_range(224, 254, 160)
        self.copy_range(255, 255, 126)
        #
        # Add symbolic names.
        #
        for name, value in ESCAPE_NAME_TO_VALUE.iteritems():
            self.string_mapping[value] = '{%s}' % name

        self.code_mapping = dict(
            (chr(i), kw) for i, kw in enumerate(BASIC_V2_TOKENS, 128)
        )
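        #
        # A few illustrative entries of the resulting string mapping (derived
        # from the ranges above): chr(65) -> 'a' and chr(97) -> 'A' (the
        # PETSCII case swap), chr(147) -> '{clear}' via its symbolic name, and
        # values without a mapping, e.g. chr(128), stay numeric as '{128}'.
        #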

    def add_range(self, start, end, value_start):
        offset = value_start - start
        self.string_mapping.update(
            (chr(i), chr(i + offset)) for i in xrange(start, end + 1)
        )

    def copy_range(self, to_start, to_end, from_start):
        offset = from_start - to_start
        self.string_mapping.update(
            (chr(i), self.string_mapping[chr(i + offset)])
            for i in xrange(to_start, to_end + 1)
        )

    def decode(self, value):
        if value == '"':
            self.is_in_string_mode = not self.is_in_string_mode
        if not self.is_in_string_mode:
            try:
                return self.code_mapping[value]
            except KeyError:
                pass    # Intentionally ignored.
        return self.string_mapping[value]

    def detokenize_line(self, line):
        self.is_in_string_mode = False
        return ''.join(imap(self.decode, line))

    def detokenize(self, basic_prg, offset=2):
        i = offset
        while get_uint16(basic_prg, i) != 0:
            i += 2  # Skip pointer to next BASIC line.
            line_number = get_uint16(basic_prg, i)
            i += 2  # Skip line number.
            j = basic_prg.index('\0', i)
            tokenized_line = basic_prg[i:j]
            yield (line_number, self.detokenize_line(tokenized_line))
            i = j + 1   # Skip to next line.


def get_uint16(data, offset):
    return ord(data[offset]) | ord(data[offset + 1]) << 8


def int2uint16(value):
    if not (0 <= value < 2**16):
        raise ValueError('integer not in range 0..65535')
    high, low = divmod(value, 256)
    return chr(low) + chr(high)
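#
# Both helpers use the 6502's little endian byte order: for example the
# default BASIC start address 0x0801 round-trips as
# int2uint16(0x0801) == '\x01\x08' and get_uint16('\x01\x08', 0) == 0x0801.
#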


def detokenize(basic_prg, offset=2):
    return Detokenizer().detokenize(basic_prg, offset)


def tokenize_escape(name):
    try:
        return ESCAPE_NAME_TO_VALUE[name]
    except KeyError:
        return chr(int(name))


def tokenize_raw_string(string):
    result = list()
    for value in string:
        tmp = ord(value)
        if tmp == 126:
            value = '\xff'
        elif 'a' <= value <= 'z':
            value = value.upper()
        elif 'A' <= value <= 'Z':
            value = chr(ord(value.lower()) + 96)
        result.append(value)
    return ''.join(result)


def _tokenize(regex, process_match, process_rest, string):
    result = list()
    i = j = 0
    for match in regex.finditer(string):
        j = match.start()
        if i != j:
            result.append(process_rest(string[i:j]))
        i = match.end()
        data, finished = process_match(match)
        result.append(data)
        if finished:
            i = len(string)
            break
    if i != len(string):
        result.append(process_rest(string[i:]))
    return ''.join(result)


tokenize_string = partial(
    _tokenize,
    re.compile(r'\{[^\}]*\}'),
    lambda m: (tokenize_escape(m.group()[1:-1]), False),
    tokenize_raw_string
)


def find_command_separator(line, offset):
    for match in re.compile(r'"[^"]*?"|:').finditer(line, offset):
        if match.group() == ':':
            return match.end()
    return len(line)


def tokenize_line_content(line):
    def process_match(match):
        result = list()
        data = match.group()
        try:
            result.append(KEYWORD2VALUE[data])
        except KeyError:
            if data.startswith('"'):
                result.append(tokenize_string(data))
            elif data.startswith('{'):
                result.append(tokenize_escape(data[1:-1]))
            else:
                assert False
        else:
            if data == 'rem':
                result.append(tokenize_string(line[match.end():]))
            elif data == 'data':
                next_command_offset = find_command_separator(line, match.end())
                result.append(
                    tokenize_string(line[match.end():next_command_offset])
                )
                result.append(tokenize_line_content(line[next_command_offset:]))
        return (''.join(result), data in ['rem', 'data'])

    return _tokenize(TOKENIZE_RE, process_match, lambda s: s.upper(), line)
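#
# Example of the special cases above: in ``10 rem more`` everything after
# 'rem' is stored literally (otherwise the 'or' inside 'more' would become a
# token), and in ``20 data abc:goto 10`` only the text up to the ':' is kept
# literal while the rest of the line is tokenized as usual.
#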


def tokenize_line(line):
    line_number_match = LINE_NUMBER_RE.match(line)
    if not line_number_match:
        raise TokenizeError('no line number found: %r' % line)
    line_number = int(line_number_match.group())
    if line_number >= 2**16:
        raise TokenizeError('line number too high (%d)' % line_number)
    if line_number >= 64000:
        LOG.warn('line number %d >= 64000', line_number)
    line = line[line_number_match.end():].lstrip().rstrip('\r\n')
    if not line.strip():
        LOG.warn('empty line (%d)', line_number)
    return int2uint16(line_number) + tokenize_line_content(line)


def tokenize(lines, address=0x0801):
    result = [int2uint16(address)]
    for line in lines:
        if line.strip():
            tokenized_line = tokenize_line(line)
            address += len(tokenized_line) + 3
            result.append(int2uint16(address) + tokenized_line + '\0')
    result.append(int2uint16(0))
    return ''.join(result)
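#
# The resulting PRG layout, e.g. for ``tokenize(['10 print"hi"'])`` with the
# default start address 0x0801: a 2 byte load address, then per line a 2 byte
# pointer to the next line, the 2 byte line number, the tokenized content and
# a terminating null byte, and finally a 0x0000 pointer marking the end of
# the program.
#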


def ascii_to_utf8(string):
    def replace_func(match):
        escape = match.group()
        return VALUE_TO_UNICODE.get(tokenize_escape(escape[1:-1]), escape)
    return re.sub(r'\{[^}]+\}', replace_func, string).encode('utf-8')


def utf8_to_ascii(string):
    return string.decode('utf-8').translate(UNICODE_TO_ESCAPE).encode('ascii')


def main():
    parser = OptionParser(
        usage='%prog [options] detokenize|tokenize',
        version=__version__,
        description='CBM BASIC V2 (de)tokenizer.',
        epilog='Written by %s.' % __author__
    )
    parser.add_option('-i', '--input', metavar='FILE', action='store',
        default=None, help='input filename (default: <stdin>)')
    parser.add_option('-o', '--output', metavar='FILE', action='store',
        default=None, help='output filename (default: <stdout>)')
    parser.add_option('--unicode', action='store_true', default=False,
        help='convert certain characters and escape sequences to/from unicode.'
            ' The encoding is UTF-8.')
    parser.add_option('--debug', action='store_true', default=False,
        help='activate debugging output like complete stack traces')

    options, arguments = parser.parse_args()

    if options.debug:
        LOG.setLevel(logging.DEBUG)

    if not arguments or arguments[0] not in ['detokenize', 'tokenize']:
        parser.error('please choose operation mode')

    try:
        in_file = open(options.input, 'rb') if options.input else sys.stdin
        out_file = open(options.output, 'wb') if options.output else sys.stdout

        command = arguments[0]
        if command == 'detokenize':
            for line_number, line in detokenize(in_file.read()):
                if options.unicode:
                    line = ascii_to_utf8(line)
                out_file.write('%5d %s\n' % (line_number, line))
        elif command == 'tokenize':
            if options.unicode:
                in_file = imap(utf8_to_ascii, in_file)
            out_file.write(tokenize(in_file))
        else:
            assert False, 'command %r should not be possible' % command
    except Exception, error:
        if options.debug:
            raise
        else:
            parser.error(error)


if __name__ == '__main__':
    main()