Advertisement
Guest User

Untitled

a guest
Jul 14th, 2019
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Rust 1.75 KB | None | 0 0
  1. use regex;
  2.  
  3. struct Token {
  4.     type: str,
  5.     val: str
  6. }
  7.  
  8. fn tokenize(code: str, rules: dict) -> List[Token] {
  9.     let idx = 1;
  10.     let regex_parts = [];
  11.     let group_type = dict({});
  12.  
  13.     for type in rules {
  14.         let regex = rules[type];
  15.         let groupname = "GROUP{}".format(idx);
  16.         regex_parts.append("(?P<{}>{})".format(groupname, regex))
  17.         group_type[groupname] = type;
  18.         idx += 1;
  19.     }
  20.  
  21.     let tokens: List[Token] = [];
  22.  
  23.     while 1 {
  24.         let tok = None;
  25.  
  26.         let m = find("|".join(regex_parts), code);
  27.         if m {
  28.             let groupname = str(m.lastgroup);
  29.             let tok_type = group_type[groupname];
  30.             tok = Token();
  31.             tok.type = tok_type;
  32.             tok.val = m.group(groupname);
  33.             code = code[m.end():];
  34.         }
  35.         else {
  36.             break;
  37.         }
  38.  
  39.         if tok == None {
  40.             break;
  41.         }
  42.  
  43.         tokens += [tok];
  44.     }
  45.  
  46.     return tokens;
  47. }
  48.  
  49. fn lex(code: str) -> List[Token] {
  50.     let NUMBER = "((0x[0-9A-F]+)|([0-9]+))";
  51.  
  52.     let rules = dict({
  53.         "ELSE-IF": "else( )+if",
  54.         "INCLUDE": "#include\([a-zA-Z_][a-zA-Z_0-9\.]*:[a-zA-Z_][a-zA-Z_0-9]*\)",
  55.         "IMPORT": "use( )*([a-zA-Z_][a-zA-Z_0-9]*(::|))*;",
  56.         "DICT": "dict( )*\({|}\)",
  57.         "STR": "\"(\\\"|\\\\|[^\"\n])*?\"i?",
  58.         "ID": "[a-zA-Z_][a-zA-Z_0-9]*"
  59.     });
  60.    
  61.     rules["::"] = "::";
  62.     rules["->"] = "->";
  63.     rules["=="] = "==";
  64.  
  65.     rules["RANGE"] = NUMBER + "( )*\.\.( )*" + NUMBER + "|(\(.*\.\..*\))";
  66.     rules["NUMBER"] = NUMBER;
  67.  
  68.     for i in "@{<([])>}+-*/%;:.,=!" {
  69.         rules[i] = "\\" + i;
  70.     }
  71.  
  72.     rules["newline"] = "\n";
  73.     rules["whitespace"] = " |\t";
  74.  
  75.     return tokenize(code, rules);
  76. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement