Advertisement
Guest User

Untitled

a guest
Dec 22nd, 2012
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 4.38 KB | None | 0 0
  1. class Lexer
  2.   KEYWORDS = ["def", "class", "if", "true", "false", "nil"]
  3.  
  4.   def tokenize(code)
  5.     # Cleanup code by remove extra line breaks
  6.     code.chomp!
  7.    
  8.     # Current character position we're parsing
  9.     i = 0
  10.    
  11.     # Collection of all parsed tokens in the form [:TOKEN_TYPE, value]
  12.     tokens = []
  13.    
  14.     # Current indent level is the number of spaces in the last indent.
  15.     current_indent = 0
  16.     # We keep track of the indentation levels we are in so that when we dedent, we can
  17.     # check if we're on the correct level.
  18.     indent_stack = []
  19.    
  20.     # This is how to implement a very simple scanner.
  21.     # Scan one character at the time until you find something to parse.
  22.     while i < code.size
  23.       chunk = code[i..-1]
  24.      
  25.       # Matching standard tokens.
  26.       #
  27.       # Matching if, print, method names, etc.
  28.       if identifier = chunk[/\A([a-z]\w*)/, 1]
  29.         # Keywords are special identifiers tagged with their own name, 'if' will result
  30.         # in an [:IF, "if"] token
  31.         if KEYWORDS.include?(identifier)
  32.           tokens << [identifier.upcase.to_sym, identifier]
  33.         # Non-keyword identifiers include method and variable names.
  34.         else
  35.           tokens << [:IDENTIFIER, identifier]
  36.         end
  37.         # skip what we just parsed
  38.         i += identifier.size
  39.      
  40.       # Matching class names and constants starting with a capital letter.
  41.       elsif constant = chunk[/\A([A-Z]\w*)/, 1]
  42.         tokens << [:CONSTANT, constant]
  43.         i += constant.size
  44.        
  45.       elsif number = chunk[/\A([0-9]+)/, 1]
  46.         tokens << [:NUMBER, number.to_i]
  47.         i += number.size
  48.        
  49.       elsif string = chunk[/\A"(.*?)"/, 1]
  50.         tokens << [:STRING, string]
  51.         i += string.size + 2
  52.      
  53.       # Here's the indentation magic!
  54.       #
  55.       # We have to take care of 3 cases:
  56.       #
  57.       #   if true:  # 1) the block is created
  58.       #     line 1
  59.       #     line 2  # 2) new line inside a block
  60.       #   continue  # 3) dedent
  61.       #
  62.       # This elsif takes care of the first case. The number of spaces will determine
  63.       # the indent level.
  64.       elsif indent = chunk[/\A\:\n( +)/m, 1] # Matches ": <newline> <spaces>"
  65.         # When we create a new block we expect the indent level to go up.
  66.         if indent.size <= current_indent
  67.           raise "Bad indent level, got #{indent.size} indents, " +
  68.                 "expected > #{current_indent}"
  69.         end
  70.         # Adjust the current indentation level.
  71.         current_indent = indent.size
  72.         indent_stack.push(current_indent)
  73.         tokens << [:INDENT, indent.size]
  74.         i += indent.size + 2
  75.  
  76.       # This elsif takes care of the two last cases:
  77.       # Case 2: We stay in the same block if the indent level (number of spaces) is the
  78.       #         same as current_indent.
  79.       # Case 3: Close the current block, if indent level is lower than current_indent.
  80.       elsif indent = chunk[/\A\n( *)/m, 1] # Matches "<newline> <spaces>"
  81.         if indent.size == current_indent # Case 2
  82.           # Nothing to do, we're still in the same block
  83.           tokens << [:NEWLINE, "\n"]
  84.         elsif indent.size < current_indent # Case 3
  85.           while indent.size < current_indent
  86.             indent_stack.pop
  87.             current_indent = indent_stack.last || 0
  88.             tokens << [:DEDENT, indent.size]
  89.           end
  90.           tokens << [:NEWLINE, "\n"]
  91.         else # indent.size > current_indent, error!
  92.           # Cannot increase indent level without using ":", so this is an error.
  93.           raise "Missing ':'"
  94.         end
  95.         i += indent.size + 1
  96.      
  97.       # Match long operators such as ||, &&, ==, !=, <= and >=.
  98.       # One character long operators are matched by the catch all `else` at the bottom.
  99.       elsif operator = chunk[/\A(\|\||&&|==|!=|<=|>=)/, 1]
  100.         tokens << [operator, operator]
  101.         i += operator.size
  102.      
  103.       # Ignore whitespace
  104.       elsif chunk.match(/\A /)
  105.         i += 1
  106.      
  107.       # Catch all single characters
  108.       # We treat all other single characters as a token. Eg.: ( ) , . ! + - <
  109.       else
  110.         value = chunk[0,1]
  111.         tokens << [value, value]
  112.         i += 1
  113.        
  114.       end
  115.      
  116.     end
  117.    
  118.     # Close all open blocks
  119.     while indent = indent_stack.pop
  120.       tokens << [:DEDENT, indent_stack.first || 0]
  121.     end
  122.    
  123.     tokens
  124.   end
  125. end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement