Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include "Lexer.hpp"

#include <stdexcept>
- namespace sage
- {
- namespace detail
- {
// ----------------------------------------------------------------------------
/// Returns the character at `ptr` and advances `ptr` one position forward.
inline char getCharAndAdvance(const char*& ptr)
{
    const char c = *ptr;
    ++ptr;
    return c;
}
// ----------------------------------------------------------------------------
/// True for whitespace that does not start a new line: space, tab,
/// form feed and vertical tab.
inline bool isHorizontalWhitespace(char c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\f':
    case '\v':
        return true;
    default:
        return false;
    }
}
// ----------------------------------------------------------------------------
/// True for line-terminating whitespace: carriage return or newline.
inline bool isVerticalWhitespace(char c)
{
    switch(c) {
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
// ----------------------------------------------------------------------------
/// Takes ownership of the input text and points the lexing cursors at it:
/// m_bufferStart/m_bufferEnd bracket the text, m_bufferPtr is the cursor.
/// std::string::data() is NUL-terminated, so *m_bufferEnd is '\0' -- several
/// scanning loops below rely on that terminator instead of bounds checks.
/// NOTE(review): this assumes m_input is declared before the three pointer
/// members in Lexer.hpp (init order follows declaration order) -- confirm.
Lexer::Lexer(std::string input)
    : m_input(std::move(input))
    , m_bufferStart(m_input.data())
    , m_bufferEnd(m_input.data() + m_input.size())
    , m_bufferPtr(m_input.data())
{
}
- // ----------------------------------------------------------------------------
- bool Lexer::skipWhitespace(Token& result, const char* curPtr)
- {
- auto c = *curPtr;
- // Skip consecutive spaces.
- while(true) {
- if(!isHorizontalWhitespace(c) && !isVerticalWhitespace(c)) {
- // if we have something other than whitespace, we're done.
- break;
- }
- c = *++curPtr;
- }
- // If the client wants us to return whitespace, return it now.
- if(m_keepWhitespace) {
- formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
- return true;
- }
- m_bufferPtr = curPtr;
- return false;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::skipLineComment(Token& result, const char* curPtr)
- {
- // Scan over the body of the comment. The common case, when scanning, is that
- // the comment contains normal ascii characters with nothing interesting in
- // them. As such, optimize for this case with the inner loop.
- //
- // This loop terminates with CurPtr pointing at the newline (or end of buffer)
- // character that ends the line comment.
- char C;
- while(true) {
- C = *curPtr;
- // Skip over characters in the fast loop.
- while(C != 0 && // Potentially EOF.
- C != '\n' && C != '\r') // Newline or DOS-style newline.
- C = *++curPtr;
- break;
- }
- // If we are returning comments as tokens, return this comment as a token.
- if(m_keepComments) {
- formTokenWithChars(result, curPtr, Token::Kind::Comment);
- return true;
- }
- // Otherwise, eat the \n character. We don't care if this is a \n\r or
- // \r\n sequence. This is an efficiency hack (because we know the \n can't
- // contribute to another token), it isn't needed for correctness. Note that
- // this is ok even in KeepWhitespaceMode, because we would have returned the
- // comment above in that mode.
- ++curPtr;
- m_bufferPtr = curPtr;
- return false;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::lexIdentifier(Token& result, const char* curPtr)
- {
- // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
- unsigned char C = *curPtr++;
- while((C >= 'A' && C <= 'Z') || (C >= 'a' && C <= 'z') ||
- (C >= '0' && C <= '9') || C == '_')
- C = *curPtr++;
- --curPtr; // Back up over the skipped character.
- formTokenWithChars(result, curPtr, Token::Kind::Identifier);
- return true;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::lexNumericConstant(Token& result, const char* curPtr)
- {
- (void)result;
- (void)curPtr;
- throw std::exception("Not implemented yet");
- }
// ----------------------------------------------------------------------------
/// Lexes a string literal. The caller has already consumed the opening quote;
/// `curPtr` points at the first character of the body. `isSingleQuoted`
/// selects ' or " as the terminator. Always returns true: either a
/// StringLiteral token (terminator found) or an Unknown token (unterminated:
/// the literal ran into a newline or the end of the buffer).
bool Lexer::lexStringLiteral(Token& result,
                             const char* curPtr,
                             bool isSingleQuoted)
{
    auto const quote = isSingleQuoted ? '\'' : '"';
    char c = getCharAndAdvance(curPtr);
    while(c != quote) {
        // A backslash escapes the next character, so read past it -- this is
        // what lets an escaped quote (\' or \") appear inside the literal.
        if(c == '\\')
            c = getCharAndAdvance(curPtr);
        // Unterminated literal: a bare newline ends it, as does reading the
        // NUL terminator exactly at the end of the buffer (curPtr has already
        // advanced one past the character just read, hence the -1).
        if(c == '\n' || c == '\r' ||
           (c == 0 && curPtr - 1 == m_bufferEnd)) { // End of file.
            // Form an Unknown token covering everything up to (not including)
            // the offending character.
            formTokenWithChars(result, curPtr - 1, Token::Kind::Unknown);
            return true;
        }
        c = getCharAndAdvance(curPtr);
    }
    // curPtr is one past the closing quote, so the token includes both quotes'
    // span from m_bufferPtr.
    formTokenWithChars(result, curPtr, Token::Kind::StringLiteral);
    return true;
}
- // ----------------------------------------------------------------------------
- void Lexer::formTokenWithChars(Token& result,
- const char* tokEnd,
- Token::Kind kind)
- {
- auto const tokLen = static_cast<uint32_t>(tokEnd - m_bufferPtr);
- result.setKind(kind);
- result.setLength(tokLen);
- result.setLocation(getSourceLocation(m_bufferPtr));
- result.setIdentifier(m_bufferPtr, tokLen);
- m_bufferPtr = tokEnd;
- }
- SourceLocation Lexer::getSourceLocation(const char* ptr)
- {
- auto loc = SourceLocation{};
- auto curPtr = m_bufferStart;
- loc.line = 1;
- while(curPtr < ptr) {
- if(isVerticalWhitespace(*curPtr++)) {
- loc.line++;
- loc.column = 0;
- } else {
- loc.column++;
- }
- }
- return loc;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::lexToken(Token& result)
- {
- LexNextToken:
- const char* curPtr = m_bufferPtr;
- if(curPtr > m_bufferEnd)
- return false;
- // Small amounts of horizontal whitespace are very common between tokens.
- if((*curPtr == ' ') || (*curPtr == '\t')) {
- ++curPtr;
- while((*curPtr == ' ') || (*curPtr == '\t'))
- ++curPtr;
- if(m_keepWhitespace) {
- // If the user wants to keep whitespaces
- // form a new token and return
- formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
- return true;
- }
- m_bufferPtr = curPtr;
- }
- auto c = getCharAndAdvance(curPtr);
- auto kind = Token::Kind::Unknown;
- // Lex identifiers and constants
- if(c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
- return lexIdentifier(result, curPtr);
- if(c >= '0' && c <= '9')
- return lexNumericConstant(result, curPtr);
- switch(c) {
- case '\0':
- // Found EOF?
- if(curPtr - 1 == m_bufferEnd) {
- formTokenWithChars(result, curPtr, Token::Kind::EndOfFile);
- return true;
- }
- if(skipWhitespace(result, curPtr))
- return true;
- goto LexNextToken;
- case '\r':
- if(*curPtr == '\n')
- c = getCharAndAdvance(curPtr);
- [[fallthrough]];
- case '\n':
- if(skipWhitespace(result, curPtr))
- return true;
- // We only saw whitespace, so just try again
- goto LexNextToken;
- case ' ':
- case '\t':
- case '\f':
- case '\v':
- SkipHorizontalWhitespace:
- if(skipWhitespace(result, curPtr))
- return true;
- SkipIgnoredUnits:
- curPtr = m_bufferPtr;
- // If the next token is obviously a // or /* */ comment, skip it
- // efficiently too (without going through the big switch stmt).
- if(curPtr[0] == '/' && curPtr[1] == '/' && !m_keepComments) {
- if(skipLineComment(result, curPtr + 2))
- return true;
- goto SkipIgnoredUnits;
- } else if(isHorizontalWhitespace(*curPtr)) {
- goto SkipHorizontalWhitespace;
- }
- // We only saw whitespace, so just try again
- // (We manually eliminate the tail call to avoid recursion.)
- goto LexNextToken;
- case '\'':
- return lexStringLiteral(result, curPtr, true);
- case '"':
- return lexStringLiteral(result, curPtr, false);
- case '?':
- kind = Token::Kind::Question;
- break;
- case '[':
- kind = Token::Kind::LeftSquare;
- break;
- case ']':
- kind = Token::Kind::RightSquare;
- break;
- case '(':
- kind = Token::Kind::LeftParen;
- break;
- case ')':
- kind = Token::Kind::RightParen;
- break;
- case '{':
- kind = Token::Kind::LeftCurly;
- break;
- case '}':
- kind = Token::Kind::RightCurly;
- break;
- case '.': {
- auto const next = *curPtr;
- if(next >= '0' && next <= '9') {
- return lexNumericConstant(result, curPtr);
- } else {
- kind = Token::Kind::Period;
- }
- }
- case '*':
- kind = Token::Kind::Star;
- break;
- case '+':
- kind = Token::Kind::Plus;
- break;
- case '-':
- kind = Token::Kind::Minus;
- break;
- case '!':
- kind = Token::Kind::Exclam;
- break;
- case '/':
- if(*curPtr == '/') {
- if(skipLineComment(result, curPtr + 1))
- return true; // There is a token to return.
- // It is common for the tokens immediately after a // comment to be
- // whitespace (indentation for the next line). Instead of going through
- // the big switch, handle it efficiently now.
- goto SkipIgnoredUnits;
- }
- kind = Token::Kind::Slash;
- break;
- case '>':
- kind = Token::Kind::Greater;
- break;
- case ':':
- kind = Token::Kind::Colon;
- break;
- case ';':
- kind = Token::Kind::SemiColon;
- break;
- case ',':
- kind = Token::Kind::Comma;
- break;
- case '#':
- kind = Token::Kind::Hash;
- break;
- default:
- kind = Token::Kind::Unknown;
- break;
- }
- formTokenWithChars(result, curPtr, kind);
- return true;
- }
- // ----------------------------------------------------------------------------
- std::vector<Token> Lexer::lexAllTokens()
- {
- auto result = std::vector<Token>{};
- auto token = Token{};
- while(lexToken(token)) {
- result.emplace_back(std::move(token));
- }
- return result;
- }
- // ----------------------------------------------------------------------------
- } // namespace detail
- } // namespace sage
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement