- # encoding: utf-8
- from __future__ import print_function
- import string
- import pprint
class InlineToken(object):
    """Inline markup delimited by a single-character start marker.

    ``start`` opens the span and ``end`` closes it; when ``end`` is omitted
    (or falsy) the start marker is reused. ``tag`` is the name reported in
    the ``('enter', tag)`` / ``('exit', tag)`` events.

    ``enter`` and ``exit`` are generators sharing one protocol: the first
    value yielded is the text that still has to be scanned, every following
    value is an event tuple.
    """

    def __init__(self, start, end=None, tag=None):
        super(InlineToken, self).__init__()
        self.start = start
        self.end = end if end else start
        self.tag = tag

    def __repr__(self):
        return "Token(%s)" % (self.tag, )

    def validate(self, stream):
        """Return True if ``stream`` opens with this token and closes later.

        ``stream`` is expected to begin at the candidate start marker. The
        end marker is searched for from index 1, i.e. after the first
        character.
        """
        # Slice instead of indexing so an empty stream simply fails to match
        # rather than raising IndexError.
        return stream[:1] == self.start and stream.find(self.end, 1) > 0

    def enter(self, stream):
        """Open the token; ``stream`` is the text after the start marker."""
        yield stream
        yield ('enter', self.tag)

    def exit(self, stream):
        """Close the token; ``stream`` is the text after the end marker."""
        yield stream
        yield ('exit', self.tag)
class LongInlineToken(InlineToken):
    """Inline token whose start marker may span several characters."""

    def validate(self, stream):
        """Return True if ``stream`` begins with the start marker and the end
        marker occurs somewhere after it."""
        if not stream.startswith(self.start):
            return False

        # Begin the search past the opening marker so it cannot match itself.
        search_from = len(self.start)
        return stream.find(self.end, search_from) > 0
class UnformattedToken(InlineToken):
    """Token whose body is emitted as-is; no nested markup is recognised."""

    def enter(self, stream):
        """Consume the whole body up to the first unescaped end marker.

        Yields the remaining text (starting at the end marker), then the
        ``enter`` event, then a single ``text`` event carrying the body.
        A backslash directly before an end marker is removed and that marker
        is kept as literal body text. ``str.index`` raises ValueError when no
        end marker is found.
        """
        closer = self.end
        pos = stream.index(closer, 1)

        while stream[pos - 1] == '\\':
            # Drop the backslash; the escaped marker slides to ``pos - 1``,
            # so resuming at ``pos`` looks just past it.
            stream = stream[:pos - 1] + stream[pos:]
            pos = stream.index(closer, pos)

        body, rest = stream[:pos], stream[pos:]
        yield rest
        yield ('enter', self.tag)
        yield ('text', body)

    def exit(self, stream):
        """Close the token; ``stream`` is the text after the end marker."""
        yield stream
        yield ('exit', self.tag)
class LinkToken(InlineToken):
    """Link token: the text between the markers, followed by a target.

    ``enter`` hands the stream back unchanged. ``exit`` reads the link target
    that follows the end marker and reports it as an ``href`` attribute
    before the ``exit`` event.
    """

    # Characters accepted as part of a link target; anything else ends it.
    # NOTE(review): '.' is not included, so a target containing a dot stops
    # at the dot -- confirm this is intended.
    LINK_CHARS = frozenset(string.ascii_letters + string.digits + '-_:/@#')

    def enter(self, stream):
        """Open the link; ``stream`` is the text after the start marker."""
        yield stream
        yield ('enter', self.tag)

    def exit(self, stream):
        """Split ``stream`` into link target and remaining text.

        Yields the remaining text, then ``('attr', ('href', target))``, then
        the ``exit`` event. The target is the longest leading run of
        ``LINK_CHARS``; it may be empty.
        """
        # Default to the whole stream so a target that runs to the end of the
        # input is still reported as the href instead of leaking out as text.
        cut = len(stream)
        for i, c in enumerate(stream):
            if c not in self.LINK_CHARS:
                cut = i
                break

        yield stream[cut:]
        yield ('attr', ('href', stream[:cut]))
        yield ('exit', self.tag)
class FootnoteToken(InlineToken):
    """Footnote reference rendered as a superscripted anchor."""

    def enter(self, stream):
        """Read the footnote label and emit the opening events.

        Yields the remaining text (starting at the end marker), then
        ``enter sup``, ``enter a``, an ``href`` attribute of ``#fn<label>``
        and the label as text. A backslash directly before an end marker is
        removed and that marker is kept as part of the label.
        """
        closer = self.end
        pos = stream.index(closer, 1)

        while stream[pos - 1] == '\\':
            # Remove the backslash and keep looking past the escaped marker.
            stream = stream[:pos - 1] + stream[pos:]
            pos = stream.index(closer, pos)

        label = stream[:pos]
        # TODO: Store the footnote in the current parser and get index.
        number = label if label.isdigit() else '0'

        yield stream[pos:]

        opening_events = (
            ('enter', 'sup'),
            ('enter', 'a'),
            ('attr', ('href', '#fn' + number)),
            ('text', number),
        )
        for event in opening_events:
            yield event

    def exit(self, stream):
        """Close the anchor and the superscript, innermost first."""
        yield stream
        for closing_tag in ('a', 'sup'):
            yield ('exit', closing_tag)
class InlineRegistry(object):
    """Collection of inline tokens, indexed by the first character of their
    start marker.

    ``tokens`` maps that character to a list of candidate tokens. Each list
    is kept with longer start markers first (registration order is preserved
    among equal lengths) so that, for example, ``**`` is offered before ``*``.
    """

    def __init__(self):
        super(InlineRegistry, self).__init__()
        self.tokens = dict()

    def register(self, token, symbol=None):
        """Add a token to the registry.

        With ``symbol`` omitted, ``token`` must be a ready-made token object
        exposing a ``start`` string. With ``symbol`` given, ``token`` is the
        tag name and ``symbol`` is either the marker string (used for both
        start and end) or a ``(start, end)`` tuple; a token object is built
        from them.
        """
        if symbol:
            # Previously written as ``isinstance(...) == 2``, which is never
            # true, so (start, end) tuples were not unpacked.
            if isinstance(symbol, tuple) and len(symbol) == 2:
                start, end = symbol
            else:
                start, end = symbol, None

            factory = InlineToken if len(start) == 1 else LongInlineToken
            token = factory(start, end, token)

        bucket = self.tokens.setdefault(token.start[0], [])
        bucket.append(token)
        # Stable sort: longest marker first, ties stay in registration order.
        bucket.sort(key=lambda candidate: len(candidate.start), reverse=True)
# Module-wide registry consulted by the tokenizer.
registry = InlineRegistry()

# Simple delimiter tokens as (tag, marker) pairs, in registration order.
for _tag, _marker in (
        ('strong', '*'),
        ('emphasis', '_'),
        ('del', '-'),
        ('ins', '+'),
        ('span', '%'),
        ('sup', '^'),
        ('sub', '~'),
        ('cite', '??'),
        ('b', '**'),
        ('i', '__'),
):
    registry.register(_tag, _marker)

# Tokens with custom enter/exit handling are registered as instances.
for _instance in (
        UnformattedToken('@', tag='code'),
        LinkToken('"', '":', tag='a'),
        FootnoteToken('[', ']'),
):
    registry.register(_instance)

# Keep the loop variables out of the module namespace.
del _tag, _marker, _instance
def _relay(emitter, token, stack):
    """Yield the remaining chunks of ``emitter``, keeping ``stack`` in step.

    ``token`` is pushed for every ``enter`` chunk and the stack is popped for
    every ``exit`` chunk; other chunks pass through untouched.
    """
    for chunk in emitter:
        if chunk[0] == 'enter':
            stack.append(token)
        elif chunk[0] == 'exit':
            stack.pop()
        yield chunk


def tokenize(source):
    """Split ``source`` into a stream of ``(kind, value)`` events.

    Plain runs are reported as ``('text', str)``; everything else is whatever
    the registered tokens' ``enter``/``exit`` generators emit. Tokens are
    looked up in the module-level ``registry`` by the character under the
    cursor. A backslash directly before a start marker, or before the end
    marker of the innermost open token, makes that marker's first character
    literal text.
    """
    stack = []  # currently open tokens, innermost last
    tokens = registry.tokens

    while source:
        current = stack[-1] if stack else None
        # ``source`` and ``stack`` only change immediately before the scan
        # below is abandoned, so the closing position can be found once.
        exit_at = source.find(current.end) if current else -1

        for i, char in enumerate(source):
            # Handle exiting the innermost open token.
            if i == exit_at:
                if i > 0:
                    if source[i - 1] == '\\':
                        # Escaped end marker: emit it literally and resume
                        # scanning right after it.
                        yield 'text', source[:i - 1] + char
                        source = source[i + 1:]
                        break

                    yield 'text', source[:i]
                    source = source[i:]

                emitter = current.exit(source[len(current.end):])
                source = next(emitter)  # first value is the text left to scan
                for chunk in _relay(emitter, current, stack):
                    yield chunk
                break

            # Handle entering a new token.
            if char not in tokens:
                continue

            remainder = source[i:]
            for candidate in tokens[char]:
                if candidate.validate(remainder):
                    break
            else:
                continue  # nothing registered for this character matches here

            if i > 0:
                if source[i - 1] == '\\':
                    # Escaped start marker: emit it literally.
                    yield 'text', source[:i - 1] + char
                    source = source[i + 1:]
                    break

                yield 'text', source[:i]

            emitter = candidate.enter(remainder[len(candidate.start):])
            source = next(emitter)  # first value is the text left to scan
            for chunk in _relay(emitter, candidate, stack):
                yield chunk
            break
        else:
            # The scan reached the end without opening or closing anything:
            # whatever is left is plain text.
            yield 'text', source
            return
def main():
    """Tokenize a sample sentence, print each event, then the rendered result.

    Events are printed one per line, indented by the current nesting depth.
    The result string wraps ``enter``/``exit`` events in angle-bracket tags
    and copies ``text`` events through; other events are only printed.
    """
    # The backslash is written as "\\" because "\[" is not a valid escape
    # sequence; the resulting string is unchanged.
    source = 'This @example@ is "neither":/defn/neither complete[1] \\[bob!] *nor* trite, *though _simple_*.'
    print("Source:", repr(source), end="\n\n")

    pieces = []
    level = 0

    for token, value in tokenize(source):
        if token == 'exit':
            # Dedent before printing so the exit lines up with its enter.
            level -= 1
            pieces.append('</{0}>'.format(value))

        print("{0}{1},".format(" " * level, repr((token, value))))

        if token == 'enter':
            level += 1
            pieces.append('<{0}>'.format(value))
        elif token == 'text':
            pieces.append(value)

    print("\nResult:", repr(''.join(pieces)))


if __name__ == '__main__':
    main()