Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- class InputFile {
- std::wstring filename;
- public:
// Stores a copy of the given path in `filename`. Nothing is opened here;
// the file is only accessed when Lex() is called.
- InputFile(const std::wstring& argfilename)
- : filename(argfilename) {}
- LexedFile Lex() {
- LexedFile l;
- auto FileHandle = CreateFile(
- filename.c_str(),
- GENERIC_READ,
- FILE_SHARE_READ,
- nullptr,
- OPEN_EXISTING,
- FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN,
- 0
- );
- std::vector<wchar_t> buffer;
- LARGE_INTEGER size;
- GetFileSizeEx(FileHandle, &size);
- buffer.resize(size.QuadPart);
- DWORD read;
- ReadFile(
- FileHandle,
- &buffer[0],
- size.QuadPart,
- &read,
- nullptr
- );
- CloseHandle(FileHandle);
- StreamInput input(&buffer);
- //std::wifstream input(filename, std::ios::in);
- std::vector<wchar_t> stack;
- wchar_t current = 0;
- static const std::unordered_map<std::wstring, LexedFile::Token> reserved_words(
- []() -> std::unordered_map<std::wstring, LexedFile::Token> {
- std::unordered_map<std::wstring, LexedFile::Token> retval;
- retval[L"namespace"] = LexedFile::Token::Namespace;
- retval[L"for"] = LexedFile::Token::For;
- retval[L"while"] = LexedFile::Token::While;
- retval[L"do"] = LexedFile::Token::Do;
- retval[L"switch"] = LexedFile::Token::Switch;
- retval[L"case"] = LexedFile::Token::Case;
- retval[L"default"] = LexedFile::Token::Default;
- retval[L"try"] = LexedFile::Token::Try;
- retval[L"catch"] = LexedFile::Token::Catch;
- retval[L"auto"] = LexedFile::Token::Auto;
- retval[L"type"] = LexedFile::Token::Type;
- retval[L"break"] = LexedFile::Token::Break;
- retval[L"continue"] = LexedFile::Token::Continue;
- retval[L"return"] = LexedFile::Token::Return;
- retval[L"static"] = LexedFile::Token::Static;
- retval[L"sizeof"] = LexedFile::Token::Sizeof;
- retval[L"decltype"] = LexedFile::Token::Decltype;
- retval[L"if"] = LexedFile::Token::If;
- retval[L"else"] = LexedFile::Token::Else;
- return retval;
- }()
- );
- // Dumps the stack, looking for reserved words or identifiers.
- auto next_no_token = [&] {
- if (stack.empty()) {
- return;
- }
- std::wstring token(stack.begin(), stack.end());
- stack.clear();
- if (reserved_words.find(token) != reserved_words.end()) {
- l.tokens.push_back(std::make_pair(reserved_words.find(token)->first, reserved_words.find(token)->second));
- return;
- }
- std::wregex regex(L"^[_A-Za-z]\\w*$");
- if (!std::regex_match(token, regex)) {
- __debugbreak();
- throw std::runtime_error("Malformed identifier");
- }
- l.tokens.push_back(std::make_pair(token, LexedFile::Token::Identifier));
- };
- auto next = [&](std::wstring name, Wide::LexedFile::Token t) {
- if (stack.empty()) {
- l.tokens.push_back(std::make_pair(std::move(name), t));
- return;
- }
- std::wstring token(stack.begin(), stack.end());
- stack.clear();
- if (reserved_words.find(token) != reserved_words.end()) {
- l.tokens.push_back(std::make_pair(reserved_words.find(token)->first, reserved_words.find(token)->second));
- return;
- }
- std::wregex regex(L"^[_A-Za-z]\\w*$");
- if (!std::regex_match(token, regex)) {
- __debugbreak();
- throw std::runtime_error("Malformed identifier");
- }
- l.tokens.push_back(std::make_pair(token, LexedFile::Token::Identifier));
- l.tokens.push_back(std::make_pair(std::move(name), t));
- };
- input >> current;
- if (current == 0xFEFF || current == 0xFFFE) {
- // BOM
- } else {
- input.putback(current);
- }
- while(input >> current) {
- // First we eliminate whitespace, when possible- including comments.
- // Then we define predicates to recognize tokens taking various forms.
- // Then they're called.
- // Then if none of them find anything, we put the character down as "identifier" in the stack and move on.
- // Whitespace
- if (current == L' ' || current == L'\n' || current == L'\t' || current == L'\r' ) {
- next_no_token();
- continue;
- }
- // Comment - also div and div-equals
- if (current == L'/') {
- // Check for comments first- we'll look ahead
- input >> current;
- if (current == L'/') {
- // single-line comment
- // explicitly empty while loop
- while(input >> current && current != L'\n');
- next_no_token();
- continue;
- }
- if (current == L'*') {
- // multi-line comment
- while(true) {
- // Wait until we find a terminator.
- while(input >> current && current != L'*');
- input >> current;
- if (current == L'/') {
- break;
- }
- // If we found a * for some other reason, like commented out code, then
- // just keep going again looking for one.
- }
- next_no_token();
- continue;
- }
- // If we weren't a comment, check for /=
- if (current == L'=') {
- next(L"/=", LexedFile::Token::DivAssign);
- continue;
- }
- // We don't have any more dual-character tokens to check for
- next(L"/", LexedFile::Token::Div);
- input.putback(current); // Put back the look-ahead token
- continue;
- }
- {
- // Check for the operators that only exist on their own, and as assignment versions
- // "double_check"
- // ! !=
- // ~ ~=
- // % %=
- // * *=
- auto check = [&](wchar_t token, LexedFile::Token original, LexedFile::Token original_assign) -> bool {
- if (current == token) {
- // Look-ahead
- input >> current;
- if (current == L'=') {
- std::wstring name;
- name += token;
- name += L"=";
- next(std::move(name), original_assign);
- return true;
- }
- std::wstring name;
- name += token;
- input.putback(current);
- next(std::move(name), original);
- return true;
- }
- return false;
- };
- // Grab *, *=, %, %=, =, ==, !, !=, ~, ~=
- if (check(L'*', LexedFile::Token::Mul, LexedFile::Token::MulAssign)) continue;
- if (check(L'%', LexedFile::Token::Mod, LexedFile::Token::ModAssign)) continue;
- if (check(L'=', LexedFile::Token::Assign, LexedFile::Token::EqualComparison)) continue;
- if (check(L'!', LexedFile::Token::LogicalNot, LexedFile::Token::NotEqualComparison)) continue;
- if (check(L'~', LexedFile::Token::NOT, LexedFile::Token::NOTAssign)) continue;
- if (check(L'^', LexedFile::Token::XOR, LexedFile::Token::XORAssign)) continue;
- }
- {
- // For tokens that take the form (for example)
- // + ++ +=
- // - -- -=
- // | || |=
- // & && &=
- auto triple_check = [&](
- wchar_t token,
- LexedFile::Token original,
- LexedFile::Token original_original,
- LexedFile::Token original_equals) -> bool
- {
- if (current == token) {
- input >> current;
- if (current == token) {
- std::wstring name;
- name += token;
- name += token;
- next(std::move(name), original_original);
- return true;
- }
- if (current == L'=') {
- std::wstring name;
- name += token;
- name += L'=';
- next(std::move(name), original_equals);
- return true;
- }
- input.putback(current);
- std::wstring name;
- name += token;
- next(std::move(name), original);
- return true;
- }
- return false;
- };
- // Triple group: +, ++, +=, -, --, -=, |, ||, |=, &, &&, &=
- if (triple_check(
- L'+',
- LexedFile::Token::Plus,
- LexedFile::Token::PlusPlus,
- LexedFile::Token::PlusAssign
- )) continue;
- // Pointer member access operator
- // Handle the special case, then just move on
- if (current == L'-') {
- input >> current;
- if (current == L'>') {
- next(L"->", LexedFile::Token::PointerMemberAccess);
- continue;
- }
- input.putback(current);
- current = L'-';
- }
- if (triple_check(
- L'-',
- LexedFile::Token::Sub,
- LexedFile::Token::MinusMinus,
- LexedFile::Token::SubAssign
- )) continue;
- if (triple_check(
- L'|',
- LexedFile::Token::OR,
- LexedFile::Token::LogicalOr,
- LexedFile::Token::ORAssign
- )) continue;
- if (triple_check(
- L'&',
- LexedFile::Token::AND,
- LexedFile::Token::LogicalAnd,
- LexedFile::Token::ANDAssign
- )) continue;
- }
- {
- // Written to handle <, <<, <=, <<= and >, >>, >=, >>=
- auto quadruple_check = [&](
- wchar_t token,
- LexedFile::Token original,
- LexedFile::Token original_original,
- LexedFile::Token original_equals,
- LexedFile::Token original_original_equals) -> bool
- {
- if (current == token) {
- input >> current;
- if (current == L'=') {
- std::wstring name;
- name += token;
- name += L'=';
- next(std::move(name), original_equals);
- return true;
- }
- if (current == token) {
- // We can put this back, and then just "check" for bitwise, which we know will succeed.
- // Look-ahead
- input >> current;
- if (current == L'=') {
- std::wstring name;
- name += token;
- name += token;
- name += L'=';
- next(std::move(name), original_original_equals);
- return true;
- }
- input.putback(current);
- std::wstring name;
- name += token;
- name += token;
- next(std::move(name), original_original);
- return true;
- }
- input.putback(current);
- std::wstring name;
- name += token;
- next(std::move(name), original);
- return true;
- }
- return false;
- };
- // Pretty much only <<=, <<, <=, < and >>=, >>, >=, > fit into the quadruple group.
- if (quadruple_check(
- L'<',
- LexedFile::Token::LessThan,
- LexedFile::Token::LeftShift,
- LexedFile::Token::LessThanOrEqual,
- LexedFile::Token::LeftShiftAssign
- )) continue;
- if (quadruple_check(
- L'>',
- LexedFile::Token::GreaterThan,
- LexedFile::Token::RightShift,
- LexedFile::Token::GreaterThanOrEqual,
- LexedFile::Token::RightShiftAssign
- )) continue;
- }
- {
- // That's everything. Now on to the other syntactic elements
- // Elements of just one character
- auto syntactic = [&](wchar_t token, LexedFile::Token original) -> bool {
- if (current == token) {
- std::wstring name;
- name += token;
- next(std::move(name), original);
- return true;
- }
- return false;
- };
- if (syntactic(L'(', LexedFile::Token::OpenParen)) continue;
- if (syntactic(L')', LexedFile::Token::CloseParen)) continue;
- if (syntactic(L'[', LexedFile::Token::OpenSquare)) continue;
- if (syntactic(L']', LexedFile::Token::CloseSquare)) continue;
- if (syntactic(L':', LexedFile::Token::Colon)) continue;
- if (syntactic(L',', LexedFile::Token::Comma)) continue;
- if (syntactic(L';', LexedFile::Token::Semicolon)) continue;
- if (syntactic(L'}', LexedFile::Token::CloseCurlyParen)) continue;
- if (syntactic(L'{', LexedFile::Token::OpenCurlyParen)) continue;
- }
- // Just dot, ellipses left, and then literal integers, floats, etc
- if (current == L'.') {
- input >> current;
- if (current == L'.') {
- // The only possible thing that can be here is another dot anyways
- input >> current;
- if (current == L'.') {
- next(L"...", LexedFile::Token::Ellipses);
- continue;
- }
- // Double dot illegal!
- throw std::runtime_error("Lexer failure: encountered '..'");
- }
- input.putback(current);
- next(L".", LexedFile::Token::MemberAccess);
- continue;
- }
- // Literals
- // TODO: Send the actual literal data with the lexeme, not just the type
- // If the stack is empty and we are a numeric character, then we begin a numeric literal.
- if (stack.empty() && current >= L'0' && current <= L'9') {
- // Consume until no more decimals
- while(input >> current && current >= L'0' && current <= L'9');
- // If the next character is a dot, we're a floating-point literal
- if (current == L'.') {
- // Consume all the characters that make up the float
- while(input >> current && current >= L'0' && current <= L'9');
- next(L"float", LexedFile::Token::Float);
- // Make "current" available for the next read as it's not part of the literal
- input.putback(current);
- continue;
- }
- next(L"integral", LexedFile::Token::Integral);
- input.putback(current);
- continue;
- }
- // Character literal
- if (current == L'\'') {
- // Character literal
- if (input >> current && current != L'\\') {
- // The next character must be a ' to be well-formed
- if (input >> current && current != L'\'') {
- throw std::runtime_error("Malformed character literal");
- }
- // Well-formed character lit
- next(L"char", LexedFile::Token::Character);
- continue;
- }
- // It's a slash. Grab the next character.
- input >> current;
- if (current != L'n' || current != L't' || current != L'r') { // not exhaustive atm
- throw std::runtime_error("Malformed character literal");
- }
- // And the remainder should be the '
- if (input >> current && current != L'\'') {
- throw std::runtime_error("Malformed character literal");
- }
- next(L"char", LexedFile::Token::Character);
- continue;
- }
- if (current == L'"') {
- while(true) {
- while(input >> current && current != L'"');
- if (!input)
- throw std::runtime_error("Non-terminated string literal");
- if (input.previous() != L'\\') {
- break;
- }
- }
- next(L"string", LexedFile::Token::String);
- continue;
- }
- // Not recognized, push and go again, and we don't want next() in this case.
- stack.push_back(current);
- }
- return l;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement