Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include "Lexer.hpp"

#include <stdexcept>
- namespace sage
- {
- namespace detail
- {
// ----------------------------------------------------------------------------
/// Returns the character at `ptr` and advances `ptr` one position forward.
inline char getCharAndAdvance(const char*& ptr)
{
    const char c = *ptr;
    ++ptr;
    return c;
}
// ----------------------------------------------------------------------------
/// True for whitespace that does not start a new line: space, tab,
/// form feed and vertical tab.
inline bool isHorizontalWhitespace(char c)
{
    switch(c) {
    case ' ':
    case '\t':
    case '\f':
    case '\v':
        return true;
    default:
        return false;
    }
}
// ----------------------------------------------------------------------------
/// True for line-terminating whitespace: carriage return or newline.
inline bool isVerticalWhitespace(char c)
{
    switch(c) {
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
// ----------------------------------------------------------------------------
/// Takes ownership of the input text and points the lexing cursors at it:
/// m_bufferStart/m_bufferEnd bracket the text, m_bufferPtr is the cursor.
/// std::string::data() is NUL-terminated, so *m_bufferEnd is '\0' -- several
/// scanning loops below rely on that terminator instead of bounds checks.
/// NOTE(review): this assumes m_input is declared before the three pointer
/// members in Lexer.hpp (init order follows declaration order) -- confirm.
Lexer::Lexer(std::string input)
    : m_input(std::move(input))
    , m_bufferStart(m_input.data())
    , m_bufferEnd(m_input.data() + m_input.size())
    , m_bufferPtr(m_input.data())
{
}
- // ----------------------------------------------------------------------------
- bool Lexer::skipWhitespace(Token& result, const char* curPtr)
- {
- auto c = *curPtr;
- // Skip consecutive spaces.
- while(true) {
- if(!isHorizontalWhitespace(c) && !isVerticalWhitespace(c)) {
- // if we have something other than whitespace, we're done.
- break;
- }
- c = *++curPtr;
- }
- // If the client wants us to return whitespace, return it now.
- if(m_keepWhitespace) {
- formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
- return true;
- }
- m_bufferPtr = curPtr;
- return false;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::skipLineComment(Token& result, const char* curPtr)
- {
- // Scan over the body of the comment. The common case, when scanning, is that
- // the comment contains normal ascii characters with nothing interesting in
- // them. As such, optimize for this case with the inner loop.
- //
- // This loop terminates with CurPtr pointing at the newline (or end of buffer)
- // character that ends the line comment.
- char C;
- while(true) {
- C = *curPtr;
- // Skip over characters in the fast loop.
- while(C != 0 && // Potentially EOF.
- C != '\n' && C != '\r') // Newline or DOS-style newline.
- C = *++curPtr;
- break;
- }
- // If we are returning comments as tokens, return this comment as a token.
- if(m_keepComments) {
- formTokenWithChars(result, curPtr, Token::Kind::Comment);
- return true;
- }
- // Otherwise, eat the \n character. We don't care if this is a \n\r or
- // \r\n sequence. This is an efficiency hack (because we know the \n can't
- // contribute to another token), it isn't needed for correctness. Note that
- // this is ok even in KeepWhitespaceMode, because we would have returned the
- // comment above in that mode.
- ++curPtr;
- m_bufferPtr = curPtr;
- return false;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::lexIdentifier(Token& result, const char* curPtr)
- {
- // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
- unsigned char C = *curPtr++;
- while((C >= 'A' && C <= 'Z') || (C >= 'a' && C <= 'z') ||
- (C >= '0' && C <= '9') || C == '_')
- C = *curPtr++;
- --curPtr; // Back up over the skipped character.
- formTokenWithChars(result, curPtr, Token::Kind::Identifier);
- return true;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::lexNumericConstant(Token& result, const char* curPtr)
- {
- (void)result;
- (void)curPtr;
- throw std::exception("Not implemented yet");
- }
// ----------------------------------------------------------------------------
/// Lexes a string literal. The caller has already consumed the opening quote;
/// `curPtr` points at the first character of the body. `isSingleQuoted`
/// selects ' or " as the terminator. Always returns true: either a
/// StringLiteral token (terminator found) or an Unknown token (unterminated:
/// the literal ran into a newline or the end of the buffer).
bool Lexer::lexStringLiteral(Token& result,
                             const char* curPtr,
                             bool isSingleQuoted)
{
    auto const quote = isSingleQuoted ? '\'' : '"';
    char c = getCharAndAdvance(curPtr);
    while(c != quote) {
        // A backslash escapes the next character, so read past it -- this is
        // what lets an escaped quote (\' or \") appear inside the literal.
        if(c == '\\')
            c = getCharAndAdvance(curPtr);
        // Unterminated literal: a bare newline ends it, as does reading the
        // NUL terminator exactly at the end of the buffer (curPtr has already
        // advanced one past the character just read, hence the -1).
        if(c == '\n' || c == '\r' ||
           (c == 0 && curPtr - 1 == m_bufferEnd)) { // End of file.
            // Form an Unknown token covering everything up to (not including)
            // the offending character.
            formTokenWithChars(result, curPtr - 1, Token::Kind::Unknown);
            return true;
        }
        c = getCharAndAdvance(curPtr);
    }
    // curPtr is one past the closing quote, so the token includes both quotes'
    // span from m_bufferPtr.
    formTokenWithChars(result, curPtr, Token::Kind::StringLiteral);
    return true;
}
- // ----------------------------------------------------------------------------
- void Lexer::formTokenWithChars(Token& result,
- const char* tokEnd,
- Token::Kind kind)
- {
- auto const tokLen = static_cast<uint32_t>(tokEnd - m_bufferPtr);
- result.setKind(kind);
- result.setLength(tokLen);
- result.setLocation(getSourceLocation(m_bufferPtr));
- result.setIdentifier(m_bufferPtr, tokLen);
- m_bufferPtr = tokEnd;
- }
- SourceLocation Lexer::getSourceLocation(const char* ptr)
- {
- auto loc = SourceLocation{};
- auto curPtr = m_bufferStart;
- loc.line = 1;
- while(curPtr < ptr) {
- if(isVerticalWhitespace(*curPtr++)) {
- loc.line++;
- loc.column = 0;
- } else {
- loc.column++;
- }
- }
- return loc;
- }
- // ----------------------------------------------------------------------------
- bool Lexer::lexToken(Token& result)
- {
- LexNextToken:
- const char* curPtr = m_bufferPtr;
- if(curPtr > m_bufferEnd)
- return false;
- // Small amounts of horizontal whitespace are very common between tokens.
- if((*curPtr == ' ') || (*curPtr == '\t')) {
- ++curPtr;
- while((*curPtr == ' ') || (*curPtr == '\t'))
- ++curPtr;
- if(m_keepWhitespace) {
- // If the user wants to keep whitespaces
- // form a new token and return
- formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
- return true;
- }
- m_bufferPtr = curPtr;
- }
- auto c = getCharAndAdvance(curPtr);
- auto kind = Token::Kind::Unknown;
- // Lex identifiers and constants
- if(c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
- return lexIdentifier(result, curPtr);
- if(c >= '0' && c <= '9')
- return lexNumericConstant(result, curPtr);
- switch(c) {
- case '\0':
- // Found EOF?
- if(curPtr - 1 == m_bufferEnd) {
- formTokenWithChars(result, curPtr, Token::Kind::EndOfFile);
- return true;
- }
- if(skipWhitespace(result, curPtr))
- return true;
- goto LexNextToken;
- case '\r':
- if(*curPtr == '\n')
- c = getCharAndAdvance(curPtr);
- [[fallthrough]];
- case '\n':
- if(skipWhitespace(result, curPtr))
- return true;
- // We only saw whitespace, so just try again
- goto LexNextToken;
- case ' ':
- case '\t':
- case '\f':
- case '\v':
- SkipHorizontalWhitespace:
- if(skipWhitespace(result, curPtr))
- return true;
- SkipIgnoredUnits:
- curPtr = m_bufferPtr;
- // If the next token is obviously a // or /* */ comment, skip it
- // efficiently too (without going through the big switch stmt).
- if(curPtr[0] == '/' && curPtr[1] == '/' && !m_keepComments) {
- if(skipLineComment(result, curPtr + 2))
- return true;
- goto SkipIgnoredUnits;
- } else if(isHorizontalWhitespace(*curPtr)) {
- goto SkipHorizontalWhitespace;
- }
- // We only saw whitespace, so just try again
- // (We manually eliminate the tail call to avoid recursion.)
- goto LexNextToken;
- case '\'':
- return lexStringLiteral(result, curPtr, true);
- case '"':
- return lexStringLiteral(result, curPtr, false);
- case '?':
- kind = Token::Kind::Question;
- break;
- case '[':
- kind = Token::Kind::LeftSquare;
- break;
- case ']':
- kind = Token::Kind::RightSquare;
- break;
- case '(':
- kind = Token::Kind::LeftParen;
- break;
- case ')':
- kind = Token::Kind::RightParen;
- break;
- case '{':
- kind = Token::Kind::LeftCurly;
- break;
- case '}':
- kind = Token::Kind::RightCurly;
- break;
- case '.': {
- auto const next = *curPtr;
- if(next >= '0' && next <= '9') {
- return lexNumericConstant(result, curPtr);
- } else {
- kind = Token::Kind::Period;
- }
- }
- case '*':
- kind = Token::Kind::Star;
- break;
- case '+':
- kind = Token::Kind::Plus;
- break;
- case '-':
- kind = Token::Kind::Minus;
- break;
- case '!':
- kind = Token::Kind::Exclam;
- break;
- case '/':
- if(*curPtr == '/') {
- if(skipLineComment(result, curPtr + 1))
- return true; // There is a token to return.
- // It is common for the tokens immediately after a // comment to be
- // whitespace (indentation for the next line). Instead of going through
- // the big switch, handle it efficiently now.
- goto SkipIgnoredUnits;
- }
- kind = Token::Kind::Slash;
- break;
- case '>':
- kind = Token::Kind::Greater;
- break;
- case ':':
- kind = Token::Kind::Colon;
- break;
- case ';':
- kind = Token::Kind::SemiColon;
- break;
- case ',':
- kind = Token::Kind::Comma;
- break;
- case '#':
- kind = Token::Kind::Hash;
- break;
- default:
- kind = Token::Kind::Unknown;
- break;
- }
- formTokenWithChars(result, curPtr, kind);
- return true;
- }
- // ----------------------------------------------------------------------------
- std::vector<Token> Lexer::lexAllTokens()
- {
- auto result = std::vector<Token>{};
- auto token = Token{};
- while(lexToken(token)) {
- result.emplace_back(std::move(token));
- }
- return result;
- }
- // ----------------------------------------------------------------------------
- } // namespace detail
- } // namespace sage
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement