Advertisement
Guest User

Lox Lexer in C

a guest
Apr 21st, 2024
181
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 10.92 KB | Source Code | 0 0
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <stdbool.h>
  4. #include <string.h>
  5. #include <ctype.h>
  6.  
// Number of Token slots a Lexer allocates up front; lexer_append_token also
// grows the token array by this many slots whenever it is full.
#define INITIAL_TOKEN_ARRAY_SIZE 1024
// Size in bytes of the fixed Token.lexeme buffer, terminator included.
#define MAX_IDENTIFIER_SIZE 1024
// Size in bytes of the scratch buffer print_token passes to token_name.
#define MAX_TOKENTYPE_NAME_SIZE 32
  10.  
// Every kind of token the lexer can label a lexeme with.
//
// NOTE(review): in the code visible in this file, lexer_next_token only emits
// the single-character tokens except TOKEN_SLASH, the one-or-two character
// operators, and TOKEN_IDENTIFIER; main appends TOKEN_EOF. The literal and
// keyword kinds (and TOKEN_SLASH) are declared and named by token_name but
// no lexer branch produces them yet.
typedef enum TokenType {
    // Single-character tokens: ( ) { } , . - + ; / *
    TOKEN_LEFT_PAREN,
    TOKEN_RIGHT_PAREN,
    TOKEN_LEFT_BRACE,
    TOKEN_RIGHT_BRACE,
    TOKEN_COMMA,
    TOKEN_DOT,
    TOKEN_MINUS,
    TOKEN_PLUS,
    TOKEN_SEMICOLON,
    TOKEN_SLASH,
    TOKEN_STAR,

    // One or two character tokens: ! != = == > >= < <=
    TOKEN_BANG,
    TOKEN_BANG_EQUAL,
    TOKEN_EQUAL,
    TOKEN_EQUAL_EQUAL,
    TOKEN_GREATER,
    TOKEN_GREATER_EQUAL,
    TOKEN_LESS,
    TOKEN_LESS_EQUAL,

    // Literals.
    TOKEN_IDENTIFIER,
    TOKEN_STRING,
    TOKEN_NUMBER,

    // Keywords.
    TOKEN_AND,
    TOKEN_CLASS,
    TOKEN_ELSE,
    TOKEN_FALSE,
    TOKEN_FUN,
    TOKEN_FOR,
    TOKEN_IF,
    TOKEN_NIL,
    TOKEN_OR,
    TOKEN_PRINT,
    TOKEN_RETURN,
    TOKEN_SUPER,
    TOKEN_THIS,
    TOKEN_TRUE,
    TOKEN_VAR,
    TOKEN_WHILE,

    // End-of-input marker; main appends one after the last real token.
    TOKEN_EOF
} TokenType;
  61.  
// A single lexed token, stored and passed around by value.
typedef struct Token {
    // Which kind of token this is.
    TokenType type;
    // Copy of the token's source text, held inline in a fixed-size buffer
    // (so every Token occupies at least MAX_IDENTIFIER_SIZE bytes).
    char lexeme[MAX_IDENTIFIER_SIZE];
} Token;
  66.  
  67. Token token_create(TokenType, const char*);
  68. void token_free(Token*);
  69. void token_name(TokenType, char*);
  70. void print_token(const Token);
  71.  
  72. Token token_create(TokenType type, const char* lexeme) {
  73.     Token token;
  74.     token.type = type;
  75.     strncpy(token.lexeme, lexeme, MAX_IDENTIFIER_SIZE);
  76.     return token;
  77. }
  78.  
// Releases a Token by passing it to free().
//
// NOTE(review): token_create returns Tokens by value and the Lexer keeps them
// inside one array, so this is only valid for a Token that was itself
// obtained from malloc/calloc/realloc. Nothing in this file calls it.
void token_free(Token* token) {
    free(token);
}
  82.  
  83. void token_name(TokenType type, char* buffer) {
  84.     switch (type) {
  85.         case TOKEN_LEFT_PAREN: strcpy(buffer, "TOKEN_LEFT_PAREN"); break;
  86.         case TOKEN_RIGHT_PAREN: strcpy(buffer, "TOKEN_RIGHT_PAREN"); break;
  87.         case TOKEN_LEFT_BRACE: strcpy(buffer, "TOKEN_LEFT_BRACE"); break;
  88.         case TOKEN_RIGHT_BRACE: strcpy(buffer, "TOKEN_RIGHT_BRACE"); break;
  89.         case TOKEN_COMMA: strcpy(buffer, "TOKEN_COMMA"); break;
  90.         case TOKEN_DOT: strcpy(buffer, "TOKEN_DOT"); break;
  91.         case TOKEN_MINUS: strcpy(buffer, "TOKEN_MINUS"); break;
  92.         case TOKEN_PLUS: strcpy(buffer, "TOKEN_PLUS"); break;
  93.         case TOKEN_SEMICOLON: strcpy(buffer, "TOKEN_SEMICOLON"); break;
  94.         case TOKEN_SLASH: strcpy(buffer, "TOKEN_SLASH"); break;
  95.         case TOKEN_STAR: strcpy(buffer, "TOKEN_STAR"); break;
  96.         case TOKEN_BANG: strcpy(buffer, "TOKEN_BANG"); break;
  97.         case TOKEN_BANG_EQUAL: strcpy(buffer, "TOKEN_BANG_EQUAL"); break;
  98.         case TOKEN_EQUAL: strcpy(buffer, "TOKEN_EQUAL"); break;
  99.         case TOKEN_EQUAL_EQUAL: strcpy(buffer, "TOKEN_EQUAL_EQUAL"); break;
  100.         case TOKEN_GREATER: strcpy(buffer, "TOKEN_GREATER"); break;
  101.         case TOKEN_GREATER_EQUAL: strcpy(buffer, "TOKEN_GREATER_EQUAL"); break;
  102.         case TOKEN_LESS: strcpy(buffer, "TOKEN_LESS"); break;
  103.         case TOKEN_LESS_EQUAL: strcpy(buffer, "TOKEN_LESS_EQUAL"); break;
  104.         case TOKEN_IDENTIFIER: strcpy(buffer, "TOKEN_IDENTIFIER"); break;
  105.         case TOKEN_STRING: strcpy(buffer, "TOKEN_STRING"); break;
  106.         case TOKEN_NUMBER: strcpy(buffer, "TOKEN_NUMBER"); break;
  107.         case TOKEN_AND: strcpy(buffer, "TOKEN_AND"); break;
  108.         case TOKEN_CLASS: strcpy(buffer, "TOKEN_CLASS"); break;
  109.         case TOKEN_ELSE: strcpy(buffer, "TOKEN_ELSE"); break;
  110.         case TOKEN_FALSE: strcpy(buffer, "TOKEN_FALSE"); break;
  111.         case TOKEN_FUN: strcpy(buffer, "TOKEN_FUN"); break;
  112.         case TOKEN_FOR: strcpy(buffer, "TOKEN_FOR"); break;
  113.         case TOKEN_IF: strcpy(buffer, "TOKEN_IF"); break;
  114.         case TOKEN_NIL: strcpy(buffer, "TOKEN_NIL"); break;
  115.         case TOKEN_OR: strcpy(buffer, "TOKEN_OR"); break;
  116.         case TOKEN_PRINT: strcpy(buffer, "TOKEN_PRINT"); break;
  117.         case TOKEN_RETURN: strcpy(buffer, "TOKEN_RETURN"); break;
  118.         case TOKEN_SUPER: strcpy(buffer, "TOKEN_SUPER"); break;
  119.         case TOKEN_THIS: strcpy(buffer, "TOKEN_THIS"); break;
  120.         case TOKEN_TRUE: strcpy(buffer, "TOKEN_TRUE"); break;
  121.         case TOKEN_VAR: strcpy(buffer, "TOKEN_VAR"); break;
  122.         case TOKEN_WHILE: strcpy(buffer, "TOKEN_WHILE"); break;
  123.         case TOKEN_EOF: strcpy(buffer, "TOKEN_EOF"); break;
  124.     }
  125. }
  126.  
  127. void print_token(const Token token) {
  128.     char name[MAX_TOKENTYPE_NAME_SIZE];
  129.     token_name(token.type, name);
  130.  
  131.     if (token.type == TOKEN_EOF) {
  132.         printf("%s", name);
  133.     }
  134.     else {
  135.         printf("%s: %s", name, token.lexeme);
  136.     }
  137. }
  138.  
// Scanning state plus the growing list of tokens produced so far.
typedef struct Lexer {
    // NUL-terminated text being scanned. lexer_free passes this pointer to
    // free(), so the Lexer effectively owns it.
    const char* source;
    // Index in `source` where the lexeme currently being scanned begins.
    size_t start;
    // Index in `source` of the next character that has not been consumed.
    size_t pos;
    // Character most recently consumed by lexer_advance ('\0' initially).
    char current;

    // Number of Token slots allocated in `tokens`.
    size_t token_capacity;
    // Heap-allocated array holding the tokens produced so far.
    Token* tokens;
    // Number of slots of `tokens` currently in use.
    size_t token_count;
} Lexer;
  149.  
  150. Lexer* lexer_init(const char*);
  151. void lexer_free(Lexer*);
  152. bool lexer_is_at_end(Lexer*);
  153. void lexer_advance(Lexer*);
  154. char lexer_peek(Lexer*);
  155. char lexer_peek_next(Lexer*);
  156. bool lexer_match(Lexer*, char);
  157. bool lexer_advance_identifier(Lexer*);
  158. bool lexer_append_token(Lexer*, Token);
  159. void lexer_token_create(Lexer*, TokenType);
  160. void lexer_next_token(Lexer* lexer);
  161.  
  162. Lexer* lexer_init(const char* source) {
  163.     Lexer* lexer = malloc(sizeof(Lexer));
  164.     lexer->source = source;
  165.     lexer->start = 0;
  166.     lexer->pos = 0;
  167.     lexer->current = '\0';
  168.     lexer->token_capacity = INITIAL_TOKEN_ARRAY_SIZE;
  169.     lexer->tokens = malloc(lexer->token_capacity * sizeof(Token));
  170.     lexer->token_count = 0;
  171.     return lexer;
  172. }
  173.  
  174. void lexer_free(Lexer* lexer) {
  175.     free((char*) lexer->source);
  176.     free(lexer->tokens);
  177.     free(lexer);
  178. }
  179.  
  180. bool lexer_is_at_end(Lexer* lexer) {
  181.     return lexer->pos >= strlen(lexer->source);
  182. }
  183.  
  184. void lexer_advance(Lexer* lexer) {
  185.     lexer->current = lexer->source[lexer->pos];
  186.     lexer->pos++;
  187. }
  188.  
  189. char lexer_peek(Lexer* lexer) {
  190.     return lexer_is_at_end(lexer) ? EOF : lexer->source[lexer->pos];
  191. }
  192.  
  193. char lexer_peek_next(Lexer* lexer) {
  194.     return lexer->pos + 1 >= strlen(lexer->source) ? EOF : lexer->source[lexer->pos + 1];
  195. }
  196.  
  197. bool lexer_match(Lexer* lexer, char c) {
  198.     if (lexer_peek(lexer) == c) {
  199.         lexer_advance(lexer);
  200.         return true;
  201.     }
  202.     return false;
  203. }
  204.  
  205. bool lexer_advance_identifier(Lexer* lexer) {
  206.     lexer->start = (lexer->pos - 1);
  207.     size_t size = 0;
  208.     while (isalnum(lexer_peek(lexer)) || lexer_peek(lexer) == '_') {
  209.         size++;
  210.         if (size >= MAX_IDENTIFIER_SIZE) { // >= Because we need an extra space for the '\0'.
  211.             return false;
  212.         }
  213.         lexer_advance(lexer);
  214.     }
  215.  
  216.     lexer_token_create(lexer, TOKEN_IDENTIFIER);
  217.     return true;
  218. }
  219.  
  220. bool lexer_append_token(Lexer* lexer, Token token) {
  221.     if (lexer->token_count >= lexer->token_capacity) {
  222.         lexer->token_capacity += INITIAL_TOKEN_ARRAY_SIZE;
  223.         Token* new_array = realloc(lexer->tokens, lexer->token_capacity * sizeof(Token));
  224.         if (new_array == NULL) {
  225.             return false;
  226.         }
  227.  
  228.         lexer->tokens = new_array;
  229.     }
  230.  
  231.     lexer->tokens[lexer->token_count] = token;
  232.     lexer->token_count++;
  233.     return true;
  234. }
  235.  
  236. void lexer_token_create(Lexer* lexer, TokenType type) {
  237.     const size_t lexeme_size = lexer->pos - lexer->start;
  238.     char lexeme[lexeme_size + 1];
  239.     for (size_t i = 0; i < lexeme_size; i++) {
  240.         lexeme[i] = lexer->source[lexer->start + i];
  241.     }
  242.     lexeme[lexeme_size] = '\0';
  243.  
  244.     Token token = token_create(type, lexeme);
  245.     bool result = lexer_append_token(lexer, token);
  246.     if (!result) {
  247.         fprintf(stderr, "Error adding Token to Token list.\n");
  248.         exit(EXIT_FAILURE);
  249.     }
  250. }
  251.  
  252. void lexer_next_token(Lexer* lexer) {
  253.     lexer->start = lexer->pos;
  254.     lexer_advance(lexer);
  255.     const char current = lexer->current;
  256.  
  257.     if      (current == '+') lexer_token_create(lexer, TOKEN_PLUS);
  258.     else if (current == '-') lexer_token_create(lexer, TOKEN_MINUS);
  259.     else if (current == '*') lexer_token_create(lexer, TOKEN_STAR);
  260.     else if (current == '.') lexer_token_create(lexer, TOKEN_DOT);
  261.     else if (current == ',') lexer_token_create(lexer, TOKEN_COMMA);
  262.     else if (current == ';') lexer_token_create(lexer, TOKEN_SEMICOLON);
  263.     else if (current == '(') lexer_token_create(lexer, TOKEN_LEFT_PAREN);
  264.     else if (current == ')') lexer_token_create(lexer, TOKEN_RIGHT_PAREN);
  265.     else if (current == '{') lexer_token_create(lexer, TOKEN_LEFT_BRACE);
  266.     else if (current == '}') lexer_token_create(lexer, TOKEN_RIGHT_BRACE);
  267.  
  268.     else if (current == '=') {
  269.         if (lexer_match(lexer, '=')) lexer_token_create(lexer, TOKEN_EQUAL_EQUAL);
  270.         else lexer_token_create(lexer, TOKEN_EQUAL);
  271.     }
  272.     else if (current == '!') {
  273.         if (lexer_match(lexer, '=')) lexer_token_create(lexer, TOKEN_BANG_EQUAL);
  274.         else lexer_token_create(lexer, TOKEN_BANG);
  275.     }
  276.     else if (current == '>') {
  277.         if (lexer_match(lexer, '=')) lexer_token_create(lexer, TOKEN_GREATER_EQUAL);
  278.         else lexer_token_create(lexer, TOKEN_GREATER);
  279.     }
  280.     else if (current == '<') {
  281.         if (lexer_match(lexer, '=')) lexer_token_create(lexer, TOKEN_LESS_EQUAL);
  282.         else lexer_token_create(lexer, TOKEN_LESS);
  283.     }
  284.  
  285.     else if (isalpha(current)) {
  286.         const bool result = lexer_advance_identifier(lexer);
  287.         if (!result) {
  288.             fprintf(stderr, "Error trying to read an identifier larger than maximum size. (%d)", MAX_IDENTIFIER_SIZE);
  289.             exit(EXIT_FAILURE);
  290.         }
  291.     }
  292.  
  293.     else if (isspace(current)) {
  294.         while (isspace(lexer_peek(lexer))) {
  295.             lexer_advance(lexer);
  296.         }
  297.     }
  298.  
  299.     else {
  300.         fprintf(stderr, "Unknown char '%c' in source file.\n", current);
  301.         exit(EXIT_FAILURE);
  302.     }
  303. }
  304.  
  305. size_t file_size(FILE* file);
  306. FILE* get_file(const char*);
  307. char* slurp_file(const char*);
  308.  
  309. size_t file_size(FILE* file) {
  310.     fseek(file, 0, SEEK_END);
  311.     size_t size = ftell(file);
  312.     rewind(file);
  313.     return size;
  314. }
  315.  
  316. FILE* get_file(const char* path) {
  317.     FILE* file = fopen(path, "r");
  318.     if (file == NULL) { // I know this is dumb.
  319.         return NULL;
  320.     }
  321.     return file;
  322. }
  323.  
  324. char* slurp_file(const char* path) {
  325.     FILE* file = get_file(path);
  326.     if (file == NULL) {
  327.         fprintf(stderr, "Couldn't open file '%s'.\n", path);
  328.         return NULL;
  329.     }
  330.  
  331.     const size_t size = file_size(file);
  332.     char* buffer = malloc(size * sizeof(char));
  333.     if (buffer == NULL) {
  334.         fprintf(stderr, "Couldn't allocate memory for file source.");
  335.         fclose(file);
  336.         return NULL;
  337.     }
  338.  
  339.     fread(buffer, sizeof(char), size, file);
  340.  
  341.     fclose(file);
  342.     return buffer;
  343. }
  344.  
  345. int main(int argc, char** argv) {
  346.     const char* source = slurp_file("code.txt");
  347.     if (source == NULL) {
  348.         return EXIT_FAILURE;
  349.     }
  350.  
  351.     Lexer* lexer = lexer_init(source);
  352.     while (!lexer_is_at_end(lexer)) {
  353.         lexer_next_token(lexer);
  354.     }
  355.     lexer_token_create(lexer, TOKEN_EOF);
  356.  
  357.     for (size_t i = 0; i < lexer->token_count; i++) {
  358.         print_token(lexer->tokens[i]);
  359.         putchar('\n');
  360.     }
  361.  
  362.     lexer_free(lexer);
  363.     return EXIT_SUCCESS;
  364. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement