Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "bacon/Lexer/Lexer.h"
- #include "bacon/Basic/TokenKind.h"
- #include "cassert"
- #define cur data[position]
- #define curi data[position++]
- using namespace bacon;
- Lexer::Lexer() {
- std::string letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPGRSTUVWXYZ_";
- for (std::string::iterator it = letters.begin(); it != letters.end(); it++)
- types[*it] = Letter;
- std::string numbers = "0123456789";
- for(std::string::iterator it = numbers.begin(); it != numbers.end(); it++)
- types[*it] = Number;
- std::string whitespaces = " \t\n\r";
- for(std::string::iterator it = whitespaces.begin();
- it != whitespaces.end(); it++)
- types[*it] = Whitespace;
- std::string eof("\0",1);
- for(std::string::iterator it = eof.begin(); it != eof.end(); it++)
- types[*it] = Eof;
- std::string symbols = "+-*%/^#=~<>(){}[];:,.\"";
- for(std::string::iterator it = symbols.begin(); it != symbols.end(); it++)
- types[*it] = Symbol;
- #define KEYWORD(X) HashTable[#X] = kw_ ## X;
- #include "bacon/Basic/TokenKind.def"
- }
- const SourceLocation Lexer::loc(size_t position) const {
- return SourceLocation(position, fileID);
- }
- const Token* Lexer::lexTokens(const SourceLocation start,
- const llvm::StringRef &data) {
- fileID = start.getFileID();
- size_t position = start.getLocation();
- Token *tokens = new Token[data.size()];
- size_t index = 0;
- Token t;
- while (position < data.size())
- switch (curi) {
- case ' ':
- case '\n':
- case '\t':
- continue;
- case '(':
- tokens[index++] = Token(loc(position-1), 1, l_param);
- continue;
- case ')':
- tokens[index++] = Token(loc(position-1), 1, r_param);
- continue;
- case '{':
- tokens[index++] = Token(loc(position-1), 1, l_brace);
- continue;
- case '}':
- tokens[index++] = Token(loc(position-1), 1, r_brace);
- continue;
- case '[':
- tokens[index++] = Token(loc(position-1), 1, l_bracket);
- continue;
- case ']':
- tokens[index++] = Token(loc(position-1), 1, r_bracket);
- continue;
- case '.':
- tokens[index++] = Token(loc(position-1), 1, dot);
- continue;
- case ';':
- tokens[index++] = Token(loc(position-1), 1, seperator);
- continue;
- case '=':
- if (cur == '=')
- tokens[index++] = Token(loc(position++-1), 2, equal);
- else
- tokens[index++] = Token(loc(position-1), 1, assign);
- continue;
- case '+':
- tokens[index++] = Token(loc(position-1), 1, plus);
- continue;
- case '-':
- tokens[index++] = Token(loc(position-1), 1, minus);
- continue;
- case '*':
- tokens[index++] = Token(loc(position-1), 1, mul);
- continue;
- case '%':
- tokens[index++] = Token(loc(position-1), 1, mod);
- continue;
- case '^':
- tokens[index++] = Token(loc(position-1), 1, pow);
- continue;
- case '#':
- tokens[index++] = Token(loc(position-1), 1, sharp);
- continue;
- case '~':
- tokens[index++] = Token(loc(position-1), 1, tilde);
- continue;
- case '<':
- if (cur == '=')
- tokens[index++] = Token(loc(position++-1), 2, ltequal);
- else
- tokens[index++] = Token(loc(position-1), 1, lt);
- continue;
- case '>':
- if (cur == '=')
- tokens[index++] = Token(loc(position++-1), 2, gtequal);
- else
- tokens[index++] = Token(loc(position-1), 1, gt);
- continue;
- case ':':
- tokens[index++] = Token(loc(position-1), 1, colon);
- continue;
- case ',':
- tokens[index++] = Token(loc(position-1), 1, comma);
- continue;
- case '!':
- if (cur == '=')
- tokens[index++] = Token(loc(position++-1), 2, notequal);
- else
- assert(0 && "unimplemented character");
- continue;
- case '?':
- tokens[index++] = Token(loc(position-1), 1, question);
- continue;
- case '/':
- if (cur == '/') {
- while (curi != '\n');
- continue;
- }
- tokens[index++] = Token(loc(position-1), 1, div);
- continue;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- {
- int i = position;
- while (types[cur] == Number)
- position++;
- tokens[index++] = Token(loc(i-1), position-(i-1), number_literal);
- continue;
- }
- case '\"':
- {
- unsigned int pos = position;
- // scan to the next " and skip in-line \"
- while(curi != '\"') {
- assert((int)cur && "Missing end \"");
- if(data[position-1] == '\\')
- position++;
- }
- tokens[index++] = Token(loc(pos-1),
- position-(pos-1),
- string_literal);
- continue;
- }
- case '\'':
- {
- unsigned int pos = position;
- // scan to the next '
- while(curi != '\'')
- assert((int)cur && "Missing end \'");
- tokens[index++] = Token(loc(pos-1),
- position-(pos-1),
- char_literal);
- continue;
- }
- case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
- case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
- case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
- case 's': case 't': case 'u': case 'v': case 'w': case 'x':
- case 'y': case 'z':
- case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
- case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
- case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
- case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
- case 'Y': case 'Z':
- case '_': {
- int i = position;
- while (types[cur] & (Letter | Number))
- position++;
- HashTableEntryTy &tok = HashTable.GetOrCreateValue(data.slice(i-1, position));
- Tok t = tok.getValue();
- if (t)
- tokens[index++] = Token(loc(i-1), position-(i-1), t);
- else {
- tok.setValue(identifier);
- tokens[index++] = Token(loc(i-1), position-(i-1), identifier);
- }
- continue;
- }
- case 0:
- tokens[index++] = Token(loc(position), 0, eof);
- goto Lend;
- default:
- printf("got a %c(%i) at %zu\n", data[position-1], (int)data[position-1], position);
- assert(0 && "unimplemented character");
- }
- Lend:
- return tokens;
- }
Add Comment
Please, Sign In to add comment