Advertisement
GoofyAsmodeus

Tokenization

Jan 9th, 2018
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Rust 3.19 KB | None | 0 0
  // Sample C program fed to the tokenizer. It is embedded directly instead of
  // being read from a file into a String, which keeps the demo self-contained
  // and easy to test.
  static SOURCE: &str = concat!(
      "int main(void)\n",
      "{\n",
      "   int a = 2;\n",
      "   a = a * 2;\n",
      "   return 0;\n",
      "}",
  );
  11.  
  12. fn main() {
  13.     println!("{:?}", tokenize(SOURCE));
  14. }
  15.  
  /// Reserved words recognised by the tokenizer.
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
  enum Keyword {
      Int,    // int
      Void,   // void
      Return, // return
  }

  /// Single-character punctuation and operators recognised by the tokenizer.
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
  enum Symbol {
      LParen,    // (
      RParen,    // )
      LBrace,    // {
      RBrace,    // }
      Assign,    // =
      Semicolon, // ;
      Multiply,  // *
  }

  /// A lexical token produced by `tokenize`.
  #[derive(Debug, Clone, PartialEq, Eq)]
  enum Token {
      /// One of the reserved words in `Keyword`.
      Keyword(Keyword),
      /// One of the single-character symbols in `Symbol`.
      Symbol(Symbol),
      /// Any alphabetic word that is not a keyword.
      Identifier(String),
      /// A decimal integer literal.
      Integer(i32),
  }

  /// Collects `first` plus every immediately following character accepted by
  /// `accept`, leaving the first rejected character (if any) unconsumed.
  fn read_run<I, P>(chars: &mut std::iter::Peekable<I>, first: char, accept: P) -> String
  where
      I: Iterator<Item = char>,
      P: Fn(char) -> bool,
  {
      let mut run = String::new();
      run.push(first);
      // `peek` returns `None` at end of input, which simply ends the run; the
      // input is allowed to stop in the middle of a number or a word.
      while let Some(&next) = chars.peek() {
          if !accept(next) {
              break;
          }
          run.push(next);
          chars.next();
      }
      run
  }

  /// Splits `source` into a flat list of tokens.
  ///
  /// Recognised input: the symbols `( ) { } = ; *`, runs of ASCII digits
  /// (emitted as `Token::Integer`), and runs of alphabetic characters (emitted
  /// as a `Token::Keyword` for `int`, `void` and `return`, otherwise as a
  /// `Token::Identifier`). Every other character, whitespace included, is
  /// skipped without producing a token.
  ///
  /// Words consist of alphabetic characters only, so `a1` yields
  /// `Identifier("a")` followed by `Integer(1)`.
  ///
  /// # Panics
  ///
  /// Panics if an integer literal does not fit in an `i32`.
  fn tokenize(source: &str) -> Vec<Token> {
      let mut tokens = Vec::new();
      // Every character (including whitespace) in the source code.
      let mut chars = source.chars().peekable();

      while let Some(c) = chars.next() {
          let token = match c {
              // A symbol character is a complete token on its own.
              '(' => Token::Symbol(Symbol::LParen),
              ')' => Token::Symbol(Symbol::RParen),
              '{' => Token::Symbol(Symbol::LBrace),
              '}' => Token::Symbol(Symbol::RBrace),
              '=' => Token::Symbol(Symbol::Assign),
              ';' => Token::Symbol(Symbol::Semicolon),
              '*' => Token::Symbol(Symbol::Multiply),
              // Only ASCII digits are accepted: other Unicode numerics (such
              // as superscripts) cannot be parsed as an `i32`.
              _ if c.is_ascii_digit() => {
                  let digits = read_run(&mut chars, c, |ch| ch.is_ascii_digit());
                  let value = digits
                      .parse::<i32>()
                      .expect("integer literal does not fit in i32");
                  Token::Integer(value)
              }
              _ if c.is_alphabetic() => {
                  let word = read_run(&mut chars, c, char::is_alphabetic);
                  // A word may be a keyword; only otherwise is it an identifier.
                  match word.as_str() {
                      "int" => Token::Keyword(Keyword::Int),
                      "void" => Token::Keyword(Keyword::Void),
                      "return" => Token::Keyword(Keyword::Return),
                      _ => Token::Identifier(word),
                  }
              }
              // Whitespace and any unrecognised character produce no token.
              _ => continue,
          };
          tokens.push(token);
      }
      tokens
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement