jewalky

tokenizer.py

Mar 1st, 2016
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.77 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. # this should output object tree from own config format
  4. import re
  5.  
  6.  
  7. class ZTokenizerError(Exception):
  8.     pass
  9.    
  10. class ZTokenizer:
  11.     def __init__(self, buf):
  12.         self.buffer = buf
  13.         self.line = 0
  14.         self.position = 0
  15.        
  16.     # token is either identifier, integer, float, string or other.
  17.     # comments are ignored.
  18.     def require_identifier(self):
  19.         v = self.get_identifier()
  20.         if v is None:
  21.             raise ZTokenizerError('Expected identifier at line %d' % self.line)
  22.         return v
  23.        
  24.     def get_identifier(self):
  25.         # identifier is something that contains only A-Za-z0-9_
  26.         try:
  27.             v = re.search(r'^([A-Za-z_][A-Za-z0-9_]*).*', self.buffer[self.position:]).group(1)
  28.             self.position += len(v)
  29.         except:
  30.             return None
  31.         return v
  32.        
  33.     def require_whitespace(self):
  34.         v = self.get_whitespace()
  35.         if v is None:
  36.             raise ZTokenizerError('Expected whitespace at line %d' % self.line)
  37.         return v
  38.        
  39.     def get_only_whitespace(self):
  40.         try:
  41.             v = re.search(r'^([\s\t\r\n]+).*', self.buffer[self.position:]).group(1)
  42.             if v is None or not v:
  43.                 return 0
  44.             self.position += len(v)
  45.             # check for newlines, advance line for all newlines
  46.             self.line += v.count('\n')
  47.             return len(v)
  48.         except:
  49.             return 0
  50.        
  51.     def get_whitespace(self):
  52.         gwl = 0
  53.         while True:
  54.             wl = 0
  55.             wl += self.get_only_whitespace()
  56.             # try to find nearest one-line comment
  57.             try:
  58.                 v = re.search(r'^(\/\/[^\n]*)', self.buffer[self.position:]).group(1)
  59.                 # imagine v is found.
  60.                 wl += len(v)
  61.                 self.position += len(v)
  62.                 # usually adds one newline.
  63.                 self.line += v.count('\n')
  64.             except:
  65.                 pass
  66.             wl += self.get_only_whitespace()
  67.             # try to find nearest multiline comment
  68.             if self.buffer[self.position:self.position+2] == '/*':
  69.                 npos = self.buffer.find('*/', self.position+2)
  70.                 if npos < 0:
  71.                     raise ZTokenizerError('No multiline comment end found at line %d' % self.line)
  72.                 v = self.buffer[self.position:npos+2]
  73.                 # v should now have all comment block
  74.                 wl += len(v)
  75.                 self.position += len(v)
  76.                 # add lines
  77.                 self.line += v.count('\n')
  78.             wl += self.get_only_whitespace()
  79.             gwl += wl
  80.             if wl <= 0:
  81.                 break
  82.         return gwl
  83.        
  84.     # "other" are characters (), {}, [], -, +, /, *, =, !, ?, :, ;.
  85.     # when you require this, you can also specify which character you need.
  86.     def require_other(self, which='(){}[]-+/*=!?:;.<>'):
  87.         v = self.get_other()
  88.         if v is None or v not in which:
  89.             raise ZTokenizerError('Expected one of "%s" at line %d' % (which, self.line))
  90.         return v
  91.        
  92.     def get_other(self, which='(){}[]-+/*=!?:;.<>'):
  93.         try:
  94.             v = re.search(r'^([\(\)\{\}\[\]\-\+/\*\=\!\?\:\;\.\<\>]).*', self.buffer[self.position:]).group(1)
  95.             if v not in which:
  96.                 return None
  97.             self.position += len(v)
  98.         except:
  99.             return None
  100.         return v
  101.        
  102.     def require_string(self):
  103.         v = self.get_string()
  104.         if v is None:
  105.             raise ZTokenizerError('Expected string literal at line %d' % self.line)
  106.         return v
  107.        
  108.     def get_escape(self, sequence):
  109.         seq_dict = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\', '"': '"'}
  110.         try:
  111.             if sequence[0] == 'x' or sequence[0] == 'u': # hexadecimal character
  112.                 v = re.search(r'^([0-9A-Fa-f]{1,4}).*', sequence[1:]).group(1)
  113.                 return len(v)+1, unichr(int(v, 16))
  114.             elif sequence[0] == '0': # octal character
  115.                 v = re.search(r'^([0-8]{1,6}).*', sequence[1:]).group(1)
  116.                 return len(v)+1, unichr(int(v, 8))
  117.             elif sequence[0] in seq_dict: # simple character
  118.                 return 1, seq_dict[sequence[0]]
  119.         except:
  120.             pass
  121.         return None, None # unknown/invalid escape causes discarding of whole string
  122.        
  123.     def get_string(self):
  124.         try:
  125.             v = re.search(r'^(\"([^\\\"]|\\.)*\").*', self.buffer[self.position:]).group(1)
  126.             self.position += len(v)
  127.             # now that we got the string, unescape it
  128.             v = v[1:-1]
  129.             ov = ''
  130.             i = -1
  131.             while i+1 < len(v):
  132.                 i += 1
  133.                 if v[i] == '\\': # escaped character. this is either \0####, \x####, \u####, \r, \n, \t or \\ or \"
  134.                     charlen, char = self.get_escape(v[i+1:])
  135.                     if char is None:
  136.                         return None
  137.                     ov += char
  138.                     i += charlen
  139.                 else:
  140.                     ov += v[i]
  141.             return ov
  142.         except:
  143.             pass
  144.         return None
  145.        
  146.     def require_integer(self):
  147.         v = self.get_integer()
  148.         if v is None:
  149.             raise ZTokenizerError('Expected integer at line %d' % self.line)
  150.         return v
  151.        
  152.     def get_integer(self):
  153.         try:
  154.             # formats allowed:
  155.             # 0x... (hexadecimal)
  156.             # 0... (octal)
  157.             # decimal
  158.             # allow for negative.
  159.             mul = 1
  160.             if self.buffer[self.position] == '-':
  161.                 mul = -1
  162.                 self.position += 1
  163.             v = re.search(r'^(0x[A-Fa-f0-9]+|0[0-8]+|[0-9]+).*', self.buffer[self.position:]).group(1)
  164.             self.position += len(v)
  165.             if v[0:2] == '0x':
  166.                 return int(v[2:], 16) * mul
  167.             elif v[0] == '0':
  168.                 return int(v[1:], 8) * mul
  169.             return int(v, 10) * mul
  170.         except:
  171.             return None
  172.            
  173.     def require_float(self):
  174.         v = self.get_float()
  175.         if v is None:
  176.             raise ZTokenizerError('Expected float at line %d' % self.line)
  177.         return v
  178.  
  179.     def get_float(self):
  180.         try:
  181.             # any count of decimal digits
  182.             # then dot (required)
  183.             # then any count of decimal digits and e or - (this is a hack to allow 1e-2 notation)
  184.             v = re.search(r'^([0-9]*([0-9\-e]+|\.[0-9\-e]*)?).*', self.buffer[self.position:]).group(1)
  185.             self.position += len(v)
  186.             return float(v)
  187.         except:
  188.             return None
  189.            
  190.     def is_eof(self):
  191.         return self.position >= len(self.buffer)
Add Comment
Please, Sign In to add comment