Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- class Lexer
- KEYWORDS = ["def", "class", "if", "true", "false", "nil"]
- def tokenize(code)
- # Cleanup code by remove extra line breaks
- code.chomp!
- # Current character position we're parsing
- i = 0
- # Collection of all parsed tokens in the form [:TOKEN_TYPE, value]
- tokens = []
- # Current indent level is the number of spaces in the last indent.
- current_indent = 0
- # We keep track of the indentation levels we are in so that when we dedent, we can
- # check if we're on the correct level.
- indent_stack = []
- # This is how to implement a very simple scanner.
- # Scan one character at the time until you find something to parse.
- while i < code.size
- chunk = code[i..-1]
- # Matching standard tokens.
- #
- # Matching if, print, method names, etc.
- if identifier = chunk[/\A([a-z]\w*)/, 1]
- # Keywords are special identifiers tagged with their own name, 'if' will result
- # in an [:IF, "if"] token
- if KEYWORDS.include?(identifier)
- tokens << [identifier.upcase.to_sym, identifier]
- # Non-keyword identifiers include method and variable names.
- else
- tokens << [:IDENTIFIER, identifier]
- end
- # skip what we just parsed
- i += identifier.size
- # Matching class names and constants starting with a capital letter.
- elsif constant = chunk[/\A([A-Z]\w*)/, 1]
- tokens << [:CONSTANT, constant]
- i += constant.size
- elsif number = chunk[/\A([0-9]+)/, 1]
- tokens << [:NUMBER, number.to_i]
- i += number.size
- elsif string = chunk[/\A"(.*?)"/, 1]
- tokens << [:STRING, string]
- i += string.size + 2
- # Here's the indentation magic!
- #
- # We have to take care of 3 cases:
- #
- # if true: # 1) the block is created
- # line 1
- # line 2 # 2) new line inside a block
- # continue # 3) dedent
- #
- # This elsif takes care of the first case. The number of spaces will determine
- # the indent level.
- elsif indent = chunk[/\A\:\n( +)/m, 1] # Matches ": <newline> <spaces>"
- # When we create a new block we expect the indent level to go up.
- if indent.size <= current_indent
- raise "Bad indent level, got #{indent.size} indents, " +
- "expected > #{current_indent}"
- end
- # Adjust the current indentation level.
- current_indent = indent.size
- indent_stack.push(current_indent)
- tokens << [:INDENT, indent.size]
- i += indent.size + 2
- # This elsif takes care of the two last cases:
- # Case 2: We stay in the same block if the indent level (number of spaces) is the
- # same as current_indent.
- # Case 3: Close the current block, if indent level is lower than current_indent.
- elsif indent = chunk[/\A\n( *)/m, 1] # Matches "<newline> <spaces>"
- if indent.size == current_indent # Case 2
- # Nothing to do, we're still in the same block
- tokens << [:NEWLINE, "\n"]
- elsif indent.size < current_indent # Case 3
- while indent.size < current_indent
- indent_stack.pop
- current_indent = indent_stack.last || 0
- tokens << [:DEDENT, indent.size]
- end
- tokens << [:NEWLINE, "\n"]
- else # indent.size > current_indent, error!
- # Cannot increase indent level without using ":", so this is an error.
- raise "Missing ':'"
- end
- i += indent.size + 1
- # Match long operators such as ||, &&, ==, !=, <= and >=.
- # One character long operators are matched by the catch all `else` at the bottom.
- elsif operator = chunk[/\A(\|\||&&|==|!=|<=|>=)/, 1]
- tokens << [operator, operator]
- i += operator.size
- # Ignore whitespace
- elsif chunk.match(/\A /)
- i += 1
- # Catch all single characters
- # We treat all other single characters as a token. Eg.: ( ) , . ! + - <
- else
- value = chunk[0,1]
- tokens << [value, value]
- i += 1
- end
- end
- # Close all open blocks
- while indent = indent_stack.pop
- tokens << [:DEDENT, indent_stack.first || 0]
- end
- tokens
- end
- end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement