Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
# This should output an object tree from a custom config format.
import re
class ZTokenizerError(Exception):
    """Raised by ZTokenizer when a required token is missing.

    Also raised when a ``/* ... */`` comment is never terminated.  The
    message carries the tokenizer's line counter at the point of failure.
    """
class ZTokenizer(object):
    """Cursor-based tokenizer over an in-memory text buffer.

    A token is an identifier, an integer, a float, a string literal or a
    single "other" punctuation character.  Whitespace and comments
    (``// ...`` and ``/* ... */``) are skipped by ``get_whitespace``.

    Every ``get_*`` method tries to read one token at the current position.
    On success it advances ``position`` and returns the token value; on
    failure it leaves ``position`` untouched and returns ``None`` (the
    whitespace getters return ``0`` instead).  Every ``require_*`` method
    wraps the matching getter and raises ``ZTokenizerError`` on failure.

    Attributes:
        buffer: the full text being tokenized.
        position: index into ``buffer`` of the next unread character.
        line: number of newlines consumed so far (zero-based line index),
            used in error messages.
    """

    # Characters accepted as single-character "other" tokens.
    _OTHER_CHARS = '(){}[]-+/*=!?:;.<>'

    # Patterns are applied with ``match(buffer, position)`` so the remaining
    # buffer is never copied for each token.
    _IDENTIFIER_RE = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
    _WHITESPACE_RE = re.compile(r'\s+')
    _LINE_COMMENT_RE = re.compile(r'//[^\n]*')
    _STRING_RE = re.compile(r'"(?:[^\\"]|\\.)*"')
    _INTEGER_RE = re.compile(r'0x[A-Fa-f0-9]+|[0-9]+')
    _FLOAT_RE = re.compile(
        r'-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?')
    _HEX_ESCAPE_RE = re.compile(r'[0-9A-Fa-f]{1,4}')
    _OCTAL_ESCAPE_RE = re.compile(r'[0-7]{1,6}')

    _SIMPLE_ESCAPES = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\', '"': '"'}

    # Code point -> character.  ``unichr`` exists only on Python 2; fall
    # back to ``chr`` so numeric escapes also work on Python 3.
    try:
        _to_char = staticmethod(unichr)  # noqa: F821
    except NameError:
        _to_char = staticmethod(chr)

    def __init__(self, buf):
        """Start tokenizing ``buf`` from its first character."""
        self.buffer = buf
        self.line = 0
        self.position = 0

    # ------------------------------------------------------------------
    # identifiers
    # ------------------------------------------------------------------
    def require_identifier(self):
        """Return the identifier at the cursor or raise ZTokenizerError."""
        v = self.get_identifier()
        if v is None:
            raise ZTokenizerError('Expected identifier at line %d' % self.line)
        return v

    def get_identifier(self):
        """Consume and return an identifier, or return None.

        An identifier starts with a letter or underscore and continues with
        letters, digits or underscores.
        """
        m = self._IDENTIFIER_RE.match(self.buffer, self.position)
        if m is None:
            return None
        self.position = m.end()
        return m.group(0)

    # ------------------------------------------------------------------
    # whitespace and comments
    # ------------------------------------------------------------------
    def require_whitespace(self):
        """Skip whitespace/comments; raise if nothing was skipped.

        Returns the number of characters consumed.
        """
        v = self.get_whitespace()
        # get_whitespace reports a character count, so "nothing found" is 0
        # (it never returns None).
        if not v:
            raise ZTokenizerError('Expected whitespace at line %d' % self.line)
        return v

    def get_only_whitespace(self):
        """Consume a run of whitespace (no comments).

        Returns the number of characters consumed (0 if none) and advances
        ``line`` by the number of newlines in the run.
        """
        m = self._WHITESPACE_RE.match(self.buffer, self.position)
        if m is None:
            return 0
        start, end = m.span()
        self.line += self.buffer.count('\n', start, end)
        self.position = end
        return end - start

    def _skip_line_comment(self):
        """Consume a ``// ...`` comment up to (not including) the newline."""
        m = self._LINE_COMMENT_RE.match(self.buffer, self.position)
        if m is not None:
            self.position = m.end()

    def _skip_block_comment(self):
        """Consume a ``/* ... */`` comment; raise if it is unterminated."""
        start = self.position
        if not self.buffer.startswith('/*', start):
            return
        end = self.buffer.find('*/', start + 2)
        if end < 0:
            raise ZTokenizerError(
                'No multiline comment end found at line %d' % self.line)
        end += 2  # include the closing "*/"
        self.line += self.buffer.count('\n', start, end)
        self.position = end

    def get_whitespace(self):
        """Consume any mix of whitespace, ``//`` and ``/* */`` comments.

        Returns the total number of characters consumed (0 if none).

        Raises:
            ZTokenizerError: a ``/*`` comment has no closing ``*/``.
        """
        start = self.position
        while True:
            before = self.position
            self.get_only_whitespace()
            self._skip_line_comment()
            self._skip_block_comment()
            if self.position == before:  # no progress: nothing left to skip
                break
        return self.position - start

    # ------------------------------------------------------------------
    # punctuation
    # ------------------------------------------------------------------
    # "other" are characters (), {}, [], -, +, /, *, =, !, ?, :, ;, ., <, >.
    # when you require this, you can also specify which character you need.
    def require_other(self, which=_OTHER_CHARS):
        """Return the punctuation character at the cursor if it is in
        ``which``; otherwise raise ZTokenizerError."""
        v = self.get_other(which)
        if v is None:
            raise ZTokenizerError(
                'Expected one of "%s" at line %d' % (which, self.line))
        return v

    def get_other(self, which=_OTHER_CHARS):
        """Consume one punctuation character contained in ``which``.

        Only characters from the tokenizer's punctuation set are ever
        returned, whatever ``which`` contains.  Returns None at end of
        input or when the character does not qualify.
        """
        ch = self.buffer[self.position:self.position + 1]
        # An empty slice (end of input) must be rejected explicitly because
        # '' is "in" every string.
        if not ch or ch not in self._OTHER_CHARS or ch not in which:
            return None
        self.position += 1
        return ch

    # ------------------------------------------------------------------
    # strings
    # ------------------------------------------------------------------
    def require_string(self):
        """Return the string literal at the cursor or raise ZTokenizerError."""
        v = self.get_string()
        if v is None:
            raise ZTokenizerError(
                'Expected string literal at line %d' % self.line)
        return v

    def get_escape(self, sequence):
        """Decode one escape sequence (the text just after a backslash).

        Supported forms: ``x``/``u`` followed by 1-4 hex digits, ``0``
        followed by 1-6 octal digits, and ``n r t \\ "``.

        Returns:
            ``(consumed, char)`` where ``consumed`` is the number of
            characters of ``sequence`` used, or ``(None, None)`` for an
            unknown or malformed escape.
        """
        if not sequence:
            return None, None
        kind = sequence[0]
        if kind == 'x' or kind == 'u':  # hexadecimal character
            digits_re, base = self._HEX_ESCAPE_RE, 16
        elif kind == '0':  # octal character
            digits_re, base = self._OCTAL_ESCAPE_RE, 8
        elif kind in self._SIMPLE_ESCAPES:  # simple character
            return 1, self._SIMPLE_ESCAPES[kind]
        else:
            return None, None

        m = digits_re.match(sequence, 1)
        if m is None:
            return None, None
        digits = m.group(0)
        try:
            char = self._to_char(int(digits, base))
        except ValueError:  # code point not representable on this build
            return None, None
        return len(digits) + 1, char

    def get_string(self):
        """Consume a double-quoted string literal and return its value.

        Escape sequences are decoded via ``get_escape``.  An unknown or
        invalid escape rejects the whole literal: None is returned and the
        cursor does not move.  Newlines inside the literal are counted in
        ``line``.
        """
        m = self._STRING_RE.match(self.buffer, self.position)
        if m is None:
            return None
        raw = m.group(0)
        body = raw[1:-1]  # strip the surrounding quotes

        pieces = []
        pos = 0
        while True:
            slash = body.find('\\', pos)
            if slash < 0:
                pieces.append(body[pos:])
                break
            pieces.append(body[pos:slash])
            # An escape is at most 7 characters long (kind + 6 digits), so
            # only that window needs to be handed over.
            consumed, char = self.get_escape(body[slash + 1:slash + 8])
            if char is None:
                return None
            pieces.append(char)
            pos = slash + 1 + consumed

        try:
            value = ''.join(pieces)
        except UnicodeError:
            # Python 2 only: non-ASCII byte strings cannot be joined with
            # unicode characters produced by numeric escapes.
            return None

        self.position = m.end()
        self.line += raw.count('\n')
        return value

    # ------------------------------------------------------------------
    # numbers
    # ------------------------------------------------------------------
    def require_integer(self):
        """Return the integer at the cursor or raise ZTokenizerError."""
        v = self.get_integer()
        if v is None:
            raise ZTokenizerError('Expected integer at line %d' % self.line)
        return v

    def get_integer(self):
        """Consume and return an integer literal, or return None.

        Formats allowed, each with an optional leading ``-``:
            0x...  hexadecimal
            0...   octal (a leading zero followed by octal digits)
            decimal

        A leading-zero literal containing a non-octal digit (e.g. ``08``)
        is rejected as a whole.
        """
        pos = self.position
        sign = 1
        if self.buffer[pos:pos + 1] == '-':
            sign = -1
            pos += 1
        m = self._INTEGER_RE.match(self.buffer, pos)
        if m is None:
            return None
        text = m.group(0)
        try:
            if text.startswith('0x'):
                value = int(text[2:], 16)
            elif len(text) > 1 and text[0] == '0':
                value = int(text[1:], 8)
            else:
                # Plain decimal, including the single digit "0".
                value = int(text, 10)
        except ValueError:
            return None
        # Commit the cursor only once the literal is known to be valid.
        self.position = m.end()
        return sign * value

    def require_float(self):
        """Return the float at the cursor or raise ZTokenizerError."""
        v = self.get_float()
        if v is None:
            raise ZTokenizerError('Expected float at line %d' % self.line)
        return v

    def get_float(self):
        """Consume and return a float literal, or return None.

        Accepted: optional ``-``, then digits with an optional fractional
        part (``12``, ``12.``, ``12.5``, ``.5``), then an optional exponent
        (``e-2``, ``E+3``).  The decimal point is optional, so plain
        integers are returned as floats too.
        """
        m = self._FLOAT_RE.match(self.buffer, self.position)
        if m is None:
            return None
        self.position = m.end()
        return float(m.group(0))

    def is_eof(self):
        """Return True when the cursor is at or past the end of the buffer."""
        return self.position >= len(self.buffer)
Add Comment
Please, Sign In to add comment