# Untitled

# This module builds an object tree from our own config format.
import re


class ConfigTokenizerError(Exception):
    """Raised by the tokenizer when a required token is missing or malformed."""

class ConfigParserError(Exception):
    """Raised by the parser when the config text is structurally invalid."""

class ConfigError(Exception):
    """Raised when a config path is missing or holds a value of another type."""

class ConfigTokenizer:
    """Cursor-based tokenizer over a config source string.

    A token is an identifier, integer, float, string or "other"
    (punctuation) character.  Comments (``// ...`` and ``/* ... */``) are
    treated as whitespace.

    Every ``get_*`` method tries to read one token at the current cursor.
    On success it advances the cursor and returns the token value; on
    failure it returns ``None`` (``0`` for the whitespace readers) and
    leaves the cursor where it was.  Every ``require_*`` method wraps the
    matching ``get_*`` and raises ``ConfigTokenizerError`` on failure.

    Attributes:
        buffer: the complete source text.
        position: index into ``buffer`` of the next unread character.
        line: number of newlines consumed so far (so it is 0 while the
            cursor is still on the first line).
    """

    # Characters accepted as "other" tokens.
    _OTHER_CHARS = '(){}[]-+/*=!?:;.'

    # Single-character escapes understood inside string literals.
    _SIMPLE_ESCAPES = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\', '"': '"'}

    # Patterns are compiled once and applied with ``match(buffer, position)``
    # so that reading a token never copies the unread tail of the buffer.
    _IDENTIFIER_RE = re.compile(r'[A-Za-z0-9_]+')
    _WHITESPACE_RE = re.compile(r'\s+')
    _LINE_COMMENT_RE = re.compile(r'//[^\n]*')
    _STRING_RE = re.compile(r'"(?:[^\\"]|\\.)*"')
    _HEX_DIGITS_RE = re.compile(r'[0-9A-Fa-f]{1,4}')
    _OCTAL_DIGITS_RE = re.compile(r'[0-7]{1,6}')
    _INTEGER_RE = re.compile(r'0x[A-Fa-f0-9]+|[0-9]+')
    _FLOAT_RE = re.compile(
        r'-?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?')

    def __init__(self, buf):
        """Start tokenizing ``buf`` from its first character."""
        self.buffer = buf
        self.line = 0
        self.position = 0

    def require_identifier(self):
        """Return the next identifier or raise ``ConfigTokenizerError``."""
        v = self.get_identifier()
        if v is None:
            raise ConfigTokenizerError('Expected identifier at line %d' % self.line)
        return v

    def get_identifier(self):
        """Read a run of ``A-Za-z0-9_`` characters; ``None`` if there is none."""
        m = self._IDENTIFIER_RE.match(self.buffer, self.position)
        if m is None:
            return None
        self.position = m.end()
        return m.group(0)

    def require_whitespace(self):
        """Skip whitespace/comments; raise if nothing could be skipped.

        Returns:
            The number of characters skipped (always positive).
        """
        v = self.get_whitespace()
        # get_whitespace() reports "nothing skipped" as 0, never as None.
        if not v:
            raise ConfigTokenizerError('Expected whitespace at line %d' % self.line)
        return v

    def get_only_whitespace(self):
        """Skip plain whitespace (no comments) and return how much was skipped."""
        m = self._WHITESPACE_RE.match(self.buffer, self.position)
        if m is None:
            return 0
        self.position = m.end()
        # Every newline that was skipped moves us one line further.
        self.line += m.group(0).count('\n')
        return m.end() - m.start()

    def get_whitespace(self):
        """Skip any mix of whitespace, ``//`` comments and ``/* */`` comments.

        Returns:
            The total number of characters skipped (0 if none).

        Raises:
            ConfigTokenizerError: a ``/*`` comment is never closed.
        """
        total = 0
        while True:
            round_start = self.position
            self.get_only_whitespace()
            # One-line comment: runs up to, but not including, the newline.
            m = self._LINE_COMMENT_RE.match(self.buffer, self.position)
            if m is not None:
                self.position = m.end()
            self.get_only_whitespace()
            # Multiline comment: runs up to and including the closing "*/".
            if self.buffer.startswith('/*', self.position):
                close = self.buffer.find('*/', self.position + 2)
                if close < 0:
                    raise ConfigTokenizerError('No multiline comment end found at line %d' % self.line)
                comment_end = close + 2
                self.line += self.buffer.count('\n', self.position, comment_end)
                self.position = comment_end
            self.get_only_whitespace()
            skipped = self.position - round_start
            if skipped <= 0:
                # A full round consumed nothing: the next character is a token.
                break
            total += skipped
        return total

    # "other" are characters (), {}, [], -, +, /, *, =, !, ?, :, ;, and '.'.
    # when you require this, you can also specify which character you need.
    def require_other(self, which='(){}[]-+/*=!?:;.'):
        """Return the next "other" character if it is in ``which``, else raise."""
        # Pass ``which`` down so an unwanted character is not consumed.
        v = self.get_other(which)
        if v is None:
            raise ConfigTokenizerError('Expected one of "%s" at line %d' % (which, self.line))
        return v

    def get_other(self, which='(){}[]-+/*=!?:;.'):
        """Read one "other" character contained in ``which``; else ``None``."""
        ch = self.buffer[self.position:self.position + 1]
        if not ch or ch not in self._OTHER_CHARS or ch not in which:
            return None
        self.position += 1
        return ch

    def require_string(self):
        """Return the next string literal (unescaped) or raise."""
        v = self.get_string()
        if v is None:
            raise ConfigTokenizerError('Expected string literal at line %d' % self.line)
        return v

    def get_escape(self, sequence):
        """Decode one escape sequence.

        Args:
            sequence: the text that follows the backslash.

        Returns:
            ``(length, char)`` where ``length`` is the number of characters
            of ``sequence`` that belong to the escape and ``char`` is the
            decoded character, or ``(None, None)`` for an unknown or
            invalid escape.
        """
        if not sequence:
            return None, None
        # Python 2 needs unichr() for code points above 255; Python 3 only
        # has chr().
        try:
            to_char = unichr
        except NameError:
            to_char = chr
        kind = sequence[0]
        if kind in 'xu':  # \x#### or \u####: 1-4 hexadecimal digits
            digits_re, base = self._HEX_DIGITS_RE, 16
        elif kind == '0':  # \0######: 1-6 octal digits
            digits_re, base = self._OCTAL_DIGITS_RE, 8
        else:  # simple one-character escape
            if kind in self._SIMPLE_ESCAPES:
                return 1, self._SIMPLE_ESCAPES[kind]
            return None, None
        m = digits_re.match(sequence, 1)
        if m is None:
            return None, None
        digits = m.group(0)
        try:
            return len(digits) + 1, to_char(int(digits, base))
        except (ValueError, OverflowError):
            # Code point outside the range the interpreter can represent.
            return None, None

    def get_string(self):
        """Read a double-quoted string literal and return it unescaped.

        Returns ``None`` (without moving the cursor) when there is no
        string literal here or when it contains an invalid escape.
        """
        m = self._STRING_RE.match(self.buffer, self.position)
        if m is None:
            return None
        raw = m.group(0)
        body = raw[1:-1]
        pieces = []
        i = 0
        while i < len(body):
            ch = body[i]
            if ch != '\\':
                pieces.append(ch)
                i += 1
                continue
            # Escaped character. The longest escape is "0" plus six octal
            # digits, so seven characters of look-ahead are enough.
            charlen, char = self.get_escape(body[i + 1:i + 8])
            if char is None:
                return None
            pieces.append(char)
            i += 1 + charlen
        self.position = m.end()
        # A literal may span lines; keep the line counter in step.
        self.line += raw.count('\n')
        return ''.join(pieces)

    def require_integer(self):
        """Return the next integer or raise ``ConfigTokenizerError``."""
        v = self.get_integer()
        if v is None:
            raise ConfigTokenizerError('Expected integer at line %d' % self.line)
        return v

    def get_integer(self):
        """Read an optionally negative integer.

        Accepted forms: ``0x...`` (hexadecimal), ``0...`` (octal, at least
        one digit after the leading zero) and plain decimal.  Returns
        ``None`` without moving the cursor if no valid integer is here.
        """
        start = self.position
        negative = self.buffer.startswith('-', start)
        if negative:
            start += 1
        m = self._INTEGER_RE.match(self.buffer, start)
        if m is None:
            return None
        text = m.group(0)
        try:
            if text.startswith('0x'):
                value = int(text[2:], 16)
            elif len(text) > 1 and text[0] == '0':
                value = int(text[1:], 8)
            else:
                # Includes a lone "0", which is not an (empty) octal number.
                value = int(text, 10)
        except ValueError:
            # e.g. "09": leading zero announces octal but 9 is not octal.
            return None
        self.position = m.end()
        return -value if negative else value

    def require_float(self):
        """Return the next float or raise ``ConfigTokenizerError``."""
        v = self.get_float()
        if v is None:
            raise ConfigTokenizerError('Expected float at line %d' % self.line)
        return v

    def get_float(self):
        """Read a decimal number and return it as ``float``.

        Accepts an optional leading ``-``, digits with an optional
        fractional part (``12``, ``1.5``, ``1.``, ``.5``) and an optional
        exponent (``1e-2``).  Returns ``None`` without moving the cursor if
        no number is here.
        """
        m = self._FLOAT_RE.match(self.buffer, self.position)
        if m is None:
            return None
        try:
            value = float(m.group(0))
        except ValueError:
            return None
        self.position = m.end()
        return value

    def is_eof(self):
        """Return True once the cursor has reached the end of the buffer."""
        return self.position >= len(self.buffer)


class ConfigParser:
    """Parse a config file into a tree of typed nodes and expose lookups.

    The file is a sequence of entries, each one of:

    * a field: ``<type> <name> = <value>`` where ``<type>`` is ``string``,
      ``int``, ``float`` or ``bool``;
    * a directory: ``<type> <name> [: <parent.path>] { ... }`` where
      ``<type>`` is a directory type (see ``is_directory_type``) and the
      optional parent names an already defined directory whose entries are
      copied into the new one.

    Every node is a dict ``{'type': <type name>, 'value': <payload>}``; the
    payload of a directory node is a dict mapping child names to nodes.
    ``self.data`` is the root directory node.

    NOTE(review): because inherited entries are copied before the body is
    parsed, redefining an inherited name inside the child raises
    "Duplicate value" - confirm whether overriding should be allowed.
    """

    def is_directory_type(self, type):
        """Return True if ``type`` names a node kind that holds child nodes."""
        return type in ('directory', 'test')

    def __init__(self, filename):
        """Read and parse ``filename``.

        Raises:
            ConfigParserError: the file is structurally invalid (unknown
                type, duplicate name, bad inheritance, unbalanced braces,
                unexpected input).
            ConfigTokenizerError: a required token is missing or malformed.
        """
        self.data = {'type': 'directory', 'value': {}}  # root directory node

        with open(filename, 'r') as f:
            tr = ConfigTokenizer(f.read())

        # Innermost open directory is last; entries are added to it.
        dir_stack = [self.data]
        # Whitespace is otherwise only skipped *after* an entry, so a file
        # that starts with a blank line or a comment needs this.
        tr.get_whitespace()
        while not tr.is_eof():
            round_start = tr.position
            c_type = tr.get_identifier()
            if c_type is not None:
                self._parse_entry(tr, c_type, dir_stack)
                tr.get_whitespace()
            # check for closing bracket if we're in a directory
            if tr.get_other('}') is not None:
                if len(dir_stack) == 1:
                    raise ConfigParserError('Bracket mismatch at line %d' % tr.line)
                dir_stack.pop()  # go one level back
                tr.get_whitespace()
            if tr.position == round_start:
                # Neither an entry nor "}" starts here; without this check
                # the loop would spin forever on the same character.
                raise ConfigParserError('Unexpected input at line %d' % tr.line)
        if len(dir_stack) > 1:
            raise ConfigParserError('Unclosed directory at end of file (line %d)' % tr.line)

    def _parse_entry(self, tr, c_type, dir_stack):
        """Parse one entry whose type identifier has just been read.

        A field is stored in the innermost open directory; a directory is
        stored there too and additionally pushed onto ``dir_stack``.
        """
        tr.get_whitespace()
        c_name = tr.require_identifier()
        tr.get_whitespace()
        # then should come either = or {, depending on type.
        # a directory might first have ": parent" if it inherits from
        # another directory (which must exist already).
        c_parent = None
        if self.is_directory_type(c_type) and tr.get_other(':') is not None:
            tr.get_whitespace()
            c_parent = self._parse_dotted_path(tr)
            tr.get_whitespace()
        c_operator = tr.require_other('{=')
        siblings = dir_stack[-1]['value']

        if c_operator == '=':
            tr.get_whitespace()
            c_value = self._parse_field_value(tr, c_type)
            if c_name in siblings:  # already exists, duplicate
                raise ConfigParserError('Duplicate value at line %d' % tr.line)
            siblings[c_name] = {'type': c_type, 'value': c_value}
            return

        # "{" opens a directory
        if not self.is_directory_type(c_type):
            raise ConfigParserError('Type %s is not a directory type at line %d' % (c_type, tr.line))
        if c_name in siblings:  # already exists, duplicate
            raise ConfigParserError('Duplicate value at line %d' % tr.line)
        new_dir = {'type': c_type, 'value': {}}
        if c_parent is not None:
            parent = self._resolve_parent(c_parent, siblings, tr.line)
            # Inherit by copying the parent's entries (the child nodes
            # themselves are shared, not duplicated).
            new_dir['value'].update(parent['value'])
        siblings[c_name] = new_dir
        dir_stack.append(new_dir)

    def _parse_dotted_path(self, tr):
        """Read ``name(.name)*`` and return it as one dot-joined string."""
        parts = [tr.require_identifier()]
        while tr.get_other('.') is not None:
            parts.append(tr.require_identifier())
        return '.'.join(parts)

    def _parse_field_value(self, tr, c_type):
        """Read the literal that follows ``=`` for a field of type ``c_type``."""
        if c_type == 'string':
            return tr.require_string()
        if c_type == 'int':
            return tr.require_integer()
        if c_type == 'float':
            return tr.require_float()
        if c_type == 'bool':
            word = tr.require_identifier()
            if word == 'true':
                return True
            if word == 'false':
                return False
            raise ConfigParserError('"true" or "false" expected at line %d' % tr.line)
        raise ConfigParserError('Type %s is not a field type at line %d' % (c_type, tr.line))

    def _resolve_parent(self, c_parent, siblings, line):
        """Find the directory node named by an inheritance clause.

        A sibling with that exact name wins; otherwise the name is looked
        up as a dot-separated path from the root.
        """
        if c_parent in siblings:
            parent = siblings[c_parent]
        else:
            parent = self.get_node(c_parent)
            if parent is None:
                raise ConfigParserError('Unknown inherited directory at line %d' % line)
        if not self.is_directory_type(parent['type']):
            raise ConfigParserError('Inherited field is not a directory type (%s) at line %d' % (parent['type'], line))
        return parent

    # get value by path (dot-separated)
    def get_node(self, path):
        """Return the node at dot-separated ``path``, or ``None`` if absent.

        Empty path components are ignored, so ``''`` yields the root node.
        """
        cnode = self.data
        for part in path.split('.'):
            if not part:
                continue
            if not self.is_directory_type(cnode['type']):
                return None
            if part not in cnode['value']:
                return None
            cnode = cnode['value'][part]
        return cnode

    def get_json_from_node(self, node):
        """Convert ``node`` to plain data: nested dicts for directories,
        the bare value for fields."""
        if not self.is_directory_type(node['type']):
            return node['value']
        children = node['value']
        return dict((k, self.get_json_from_node(children[k])) for k in children)

    def get_json(self, path):
        """Return the subtree at ``path`` as plain data.

        Raises:
            ConfigError: ``path`` does not exist.
        """
        node = self.get_node(path)
        if node is None:
            raise ConfigError("Path %s not found" % path)
        return self.get_json_from_node(node)

    def get_int(self, path):
        """Return the integer at ``path``; raise ``ConfigError`` otherwise."""
        node = self.get_node(path)
        # bool is a subclass of int, so it has to be excluded explicitly.
        if (node is None or isinstance(node['value'], bool)
                or not isinstance(node['value'], int)):
            raise ConfigError("Path %s not found or not an integer"%path)
        return node['value']

    def get_float(self, path):
        """Return the float at ``path``; raise ``ConfigError`` otherwise."""
        node = self.get_node(path)
        if node is None or not isinstance(node['value'], float):
            raise ConfigError("Path %s not found or not a float"%path)
        return node['value']

    def get_bool(self, path):
        """Return the bool at ``path``; raise ``ConfigError`` otherwise."""
        node = self.get_node(path)
        if node is None or not isinstance(node['value'], bool):
            raise ConfigError("Path %s not found or not a bool"%path)
        return node['value']

    def get_string(self, path):
        """Return the string at ``path``; raise ``ConfigError`` otherwise."""
        # ``unicode`` only exists on Python 2.
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str,)
        node = self.get_node(path)
        if node is None or not isinstance(node['value'], string_types):
            raise ConfigError("Path %s not found or not a string"%path)
        return node['value']