// Untitled

#include "bacon/Lexer/Lexer.h"
#include "bacon/Basic/TokenKind.h"
#include "cassert"

// Character at the current lexing offset. Expands to `data[position]`, so it
// can only be used where variables named `data` and `position` are in scope.
// No bounds check is performed by the macro itself.
#define cur data[position]
// Same as `cur`, but also advances `position` past the character that was
// read (post-increment); expands to `data[position++]`.
#define curi data[position++]

using namespace bacon;

// Builds the character-classification table `types` and registers the
// keyword spellings in `HashTable`.
//
// Every character listed in one of the strings below gets the matching class
// (Letter, Number, Whitespace, Eof or Symbol) stored under its own value.
Lexer::Lexer() {
  // Characters allowed inside identifiers and keywords. The alphabet must be
  // complete: a missing letter (previously 'Q', which had been mistyped as a
  // second 'G') is not classified as Letter and therefore cuts identifiers
  // in two when it appears after the first character.
  const std::string letters =
      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
  for (std::string::const_iterator it = letters.begin();
       it != letters.end(); ++it)
    types[*it] = Letter;

  const std::string numbers = "0123456789";
  for (std::string::const_iterator it = numbers.begin();
       it != numbers.end(); ++it)
    types[*it] = Number;

  const std::string whitespaces = " \t\n\r";
  for (std::string::const_iterator it = whitespaces.begin();
       it != whitespaces.end(); ++it)
    types[*it] = Whitespace;

  // The NUL character is the only member of the Eof class.
  types['\0'] = Eof;

  // NOTE(review): '!', '?' and '\'' are handled by lexTokens but are not
  // classified here; left unchanged because other users of `types` are not
  // visible from this file.
  const std::string symbols = "+-*%/^#=~<>(){}[];:,.\"";
  for (std::string::const_iterator it = symbols.begin();
       it != symbols.end(); ++it)
    types[*it] = Symbol;

  // Each KEYWORD(X) expansion maps the spelling "X" to the token kind kw_X.
  // The entries themselves come from TokenKind.def (contents not visible
  // here).
#define KEYWORD(X) HashTable[#X] = kw_ ## X;
#include "bacon/Basic/TokenKind.def"
}

// Pairs a raw character offset with the file ID currently stored in the
// lexer and returns the resulting SourceLocation.
const SourceLocation Lexer::loc(size_t position) const {
  const SourceLocation here(position, fileID);
  return here;
}

const Token* Lexer::lexTokens(const SourceLocation start,
                              const llvm::StringRef &data) {
  fileID = start.getFileID();
  size_t position = start.getLocation();

  Token *tokens = new Token[data.size()];
  size_t index = 0;
  Token t;

  while (position < data.size())
    switch (curi) {
    case ' ':
    case '\n':
    case '\t':
      continue;

    case '(':
      tokens[index++] = Token(loc(position-1), 1, l_param);
      continue;
    case ')':
      tokens[index++] = Token(loc(position-1), 1, r_param);
      continue;
    case '{':
      tokens[index++] = Token(loc(position-1), 1, l_brace);
      continue;
    case '}':
      tokens[index++] = Token(loc(position-1), 1, r_brace);
      continue;
    case '[':
      tokens[index++] = Token(loc(position-1), 1, l_bracket);
      continue;
    case ']':
      tokens[index++] = Token(loc(position-1), 1, r_bracket);
      continue;
    case '.':
      tokens[index++] = Token(loc(position-1), 1, dot);
      continue;
    case ';':
      tokens[index++] = Token(loc(position-1), 1, seperator);
      continue;
    case '=':
      if (cur == '=')
        tokens[index++] = Token(loc(position++-1), 2, equal);
      else
        tokens[index++] = Token(loc(position-1), 1, assign);
      continue;
    case '+':
      tokens[index++] = Token(loc(position-1), 1, plus);
      continue;
    case '-':
      tokens[index++] = Token(loc(position-1), 1, minus);
      continue;
    case '*':
      tokens[index++] = Token(loc(position-1), 1, mul);
      continue;
    case '%':
      tokens[index++] = Token(loc(position-1), 1, mod);
      continue;
    case '^':
      tokens[index++] = Token(loc(position-1), 1, pow);
      continue;
    case '#':
      tokens[index++] = Token(loc(position-1), 1, sharp);
      continue;
    case '~':
      tokens[index++] = Token(loc(position-1), 1, tilde);
      continue;
    case '<':
      if (cur == '=')
        tokens[index++] = Token(loc(position++-1), 2, ltequal);
      else
        tokens[index++] = Token(loc(position-1), 1, lt);
      continue;
    case '>':
      if (cur == '=')
        tokens[index++] = Token(loc(position++-1), 2, gtequal);
      else
        tokens[index++] = Token(loc(position-1), 1, gt);
      continue;
    case ':':
      tokens[index++] = Token(loc(position-1), 1, colon);
      continue;
    case ',':
      tokens[index++] = Token(loc(position-1), 1, comma);
      continue;
    case '!':
      if (cur == '=')
        tokens[index++] = Token(loc(position++-1), 2, notequal);
      else
        assert(0 && "unimplemented character");
      continue;
    case '?':
      tokens[index++] = Token(loc(position-1), 1, question);
      continue;

    case '/':
      if (cur == '/') {
        while (curi != '\n');
        continue;
      }
      tokens[index++] = Token(loc(position-1), 1, div);
      continue;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        int i = position;
        while (types[cur] == Number)
          position++;
        tokens[index++] = Token(loc(i-1), position-(i-1), number_literal);
        continue;
      }

    case '\"':
      {
        unsigned int pos = position;

        // scan to the next " and skip in-line \"
        while(curi != '\"') {
          assert((int)cur && "Missing end \"");

          if(data[position-1] == '\\')
            position++;
        }

        tokens[index++] = Token(loc(pos-1),
                                position-(pos-1),
                                string_literal);
        continue;
      }

    case '\'':
      {
        unsigned int pos = position;

        // scan to the next '
        while(curi != '\'')
          assert((int)cur && "Missing end \'");

        tokens[index++] = Token(loc(pos-1),
                                position-(pos-1),
                                char_literal);
        continue;
      }

    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
    case '_': {
      int i = position;
      while (types[cur] & (Letter | Number))
        position++;

      HashTableEntryTy &tok = HashTable.GetOrCreateValue(data.slice(i-1, position));
      Tok t = tok.getValue();
      if (t)
        tokens[index++] = Token(loc(i-1), position-(i-1), t);
      else {
        tok.setValue(identifier);
        tokens[index++] = Token(loc(i-1), position-(i-1), identifier);
      }
      continue;
    }

    case 0:
      tokens[index++] = Token(loc(position), 0, eof);
      goto Lend;

    default:
      printf("got a %c(%i) at %zu\n", data[position-1], (int)data[position-1], position);
      assert(0 && "unimplemented character");

    }
 Lend:
  return tokens;
}