// Untitled


	class InputFile {
		std::wstring filename;
	public:
		// Stores a copy of the path to lex. Nothing is opened here; the file is
		// only touched when Lex() is called.
		InputFile(const std::wstring& argfilename)
			: filename(argfilename)
		{
		}
		// Reads the file named by `filename` into memory and splits it into tokens.
		//
		// The file's bytes are copied verbatim into a wchar_t buffer (no transcoding
		// or byte-swapping is performed) and a single leading 0xFEFF / 0xFFFE unit is
		// skipped as a byte-order mark. Characters that do not belong to an operator,
		// punctuation mark, literal, comment or whitespace are accumulated on a
		// stack; the stack is flushed as a reserved word or identifier whenever
		// another token, whitespace, a comment, or the end of input is reached.
		//
		// Returns a LexedFile whose `tokens` holds (text, token kind) pairs in source
		// order. Literals are recorded with the placeholder texts "integral",
		// "float", "char" and "string" rather than their actual spelling.
		//
		// Throws std::runtime_error when the file cannot be opened, sized or read,
		// when it is too large for a single ReadFile call, or when the input contains
		// a malformed identifier, a malformed character literal, "..", an
		// unterminated string literal or an unterminated multi-line comment.
		LexedFile Lex() {
			LexedFile l;

			std::vector<wchar_t> buffer;
			{
				// Closes the file handle on every exit path from this scope,
				// including the exceptions thrown below.
				struct ScopedHandle {
					HANDLE handle;
					~ScopedHandle() { CloseHandle(handle); }
				};

				// `filename` is always a wide string, so call the wide API explicitly
				// instead of relying on the UNICODE macro.
				HANDLE raw_handle = CreateFileW(
					filename.c_str(),
					GENERIC_READ,
					FILE_SHARE_READ,
					nullptr,
					OPEN_EXISTING,
					FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN,
					nullptr
				);
				if (raw_handle == INVALID_HANDLE_VALUE) {
					throw std::runtime_error("Could not open input file");
				}
				ScopedHandle file = { raw_handle };

				LARGE_INTEGER size;
				if (!GetFileSizeEx(file.handle, &size)) {
					throw std::runtime_error("Could not query input file size");
				}
				// ReadFile takes a DWORD byte count; refuse anything it cannot express
				// instead of silently truncating the 64-bit size.
				if (size.QuadPart < 0 || size.QuadPart > static_cast<LONGLONG>(MAXDWORD)) {
					throw std::runtime_error("Input file is too large");
				}

				// The file size is in bytes, but the buffer is counted in wchar_t
				// elements: round up so every byte has room.
				const std::vector<wchar_t>::size_type byte_count =
					static_cast<std::vector<wchar_t>::size_type>(size.QuadPart);
				buffer.resize((byte_count + sizeof(wchar_t) - 1) / sizeof(wchar_t));

				DWORD read = 0;
				// An empty file has nothing to read (and no valid buffer address).
				if (!buffer.empty()) {
					if (!ReadFile(
						file.handle,
						buffer.data(),
						static_cast<DWORD>(byte_count),
						&read,
						nullptr
					)) {
						throw std::runtime_error("Could not read input file");
					}
				}
				// Keep only the whole characters actually read; a trailing partial
				// code unit is dropped.
				// NOTE(review): assumes StreamInput bounds itself by buffer.size()
				// rather than by a terminating zero - confirm against StreamInput.
				buffer.resize(read / sizeof(wchar_t));
			}
			StreamInput input(&buffer);

			//std::wifstream input(filename, std::ios::in);
			std::vector<wchar_t> stack;
			wchar_t current = 0;
			static const std::unordered_map<std::wstring, LexedFile::Token> reserved_words(
				[]() -> std::unordered_map<std::wstring, LexedFile::Token> {
					std::unordered_map<std::wstring, LexedFile::Token> retval;
					retval[L"namespace"] = LexedFile::Token::Namespace;
					retval[L"for"] = LexedFile::Token::For;
					retval[L"while"] = LexedFile::Token::While;
					retval[L"do"] = LexedFile::Token::Do;
					retval[L"switch"] = LexedFile::Token::Switch;
					retval[L"case"] = LexedFile::Token::Case;
					retval[L"default"] = LexedFile::Token::Default;
					retval[L"try"] = LexedFile::Token::Try;
					retval[L"catch"] = LexedFile::Token::Catch;
					retval[L"auto"] = LexedFile::Token::Auto;
					retval[L"type"] = LexedFile::Token::Type;
					retval[L"break"] = LexedFile::Token::Break;
					retval[L"continue"] = LexedFile::Token::Continue;
					retval[L"return"] = LexedFile::Token::Return;
					retval[L"static"] = LexedFile::Token::Static;
					retval[L"sizeof"] = LexedFile::Token::Sizeof;
					retval[L"decltype"] = LexedFile::Token::Decltype;
					retval[L"if"] = LexedFile::Token::If;
					retval[L"else"] = LexedFile::Token::Else;
					return retval;
				}()
			);
			// Built once: constructing a regex on every flush is needlessly expensive.
			static const std::wregex identifier_regex(L"^[_A-Za-z]\\w*$");

			// Reads one character of look-ahead into `current`. Returns false at the
			// end of input; callers must then neither trust `current` nor put it back.
			auto read_ahead = [&]() -> bool {
				if (input >> current) {
					return true;
				}
				return false;
			};

			// Dumps the stack, looking for reserved words or identifiers.
			auto next_no_token = [&] {
				if (stack.empty()) {
					return;
				}
				std::wstring token(stack.begin(), stack.end());
				stack.clear();
				auto reserved = reserved_words.find(token);
				if (reserved != reserved_words.end()) {
					l.tokens.push_back(std::make_pair(reserved->first, reserved->second));
					return;
				}
				if (!std::regex_match(token, identifier_regex)) {
					// MSVC intrinsic: stops in the debugger before the error is raised.
					__debugbreak();
					throw std::runtime_error("Malformed identifier");
				}
				l.tokens.push_back(std::make_pair(token, LexedFile::Token::Identifier));
			};
			// Flushes any pending word, then records the token (name, t). The token is
			// recorded even when the pending word was a reserved word.
			auto next = [&](std::wstring name, Wide::LexedFile::Token t) {
				next_no_token();
				l.tokens.push_back(std::make_pair(std::move(name), t));
			};

			// Skip a leading byte-order mark. 0xFFFE is what a BOM looks like when the
			// file has the opposite byte order; the data itself is not swapped here.
			if (read_ahead()) {
				if (current == 0xFEFF || current == 0xFFFE) {
					// BOM
				} else {
					input.putback(current);
				}
			}
			while (input >> current) {
				// First we eliminate whitespace, when possible- including comments.
				// Then we define predicates to recognize tokens taking various forms.
				// Then they're called.
				// Then if none of them find anything, we put the character down as "identifier" in the stack and move on.

				// Whitespace
				if (current == L' ' || current == L'\n' || current == L'\t' || current == L'\r') {
					next_no_token();
					continue;
				}

				// Comment - also div and div-equals
				if (current == L'/') {
					// Check for comments first- we'll look ahead
					const bool have = read_ahead();
					if (have && current == L'/') {
						// single-line comment: discard up to the newline or end of input
						while (input >> current && current != L'\n');
						next_no_token();
						continue;
					}
					if (have && current == L'*') {
						// multi-line comment: discard until "*/". Remembering whether the
						// previous character was '*' makes "**/" terminate correctly.
						bool after_star = false;
						while (true) {
							if (!read_ahead()) {
								throw std::runtime_error("Non-terminated multi-line comment");
							}
							if (after_star && current == L'/') {
								break;
							}
							after_star = (current == L'*');
						}
						next_no_token();
						continue;
					}

					// If we weren't a comment, check for /=
					if (have && current == L'=') {
						next(L"/=", LexedFile::Token::DivAssign);
						continue;
					}

					// We don't have any more dual-character tokens to check for
					next(L"/", LexedFile::Token::Div);
					if (have) {
						input.putback(current); // Put back the look-ahead character
					}
					continue;
				}

				{
					// Operators that exist on their own and as a version followed by '=':
					// *  *=,  %  %=,  =  ==,  !  !=,  ~  ~=,  ^  ^=
					auto check = [&](wchar_t token, LexedFile::Token original, LexedFile::Token original_assign) -> bool {
						if (current != token) {
							return false;
						}
						std::wstring name(1, token);
						// Look-ahead
						const bool have = read_ahead();
						if (have && current == L'=') {
							name += L'=';
							next(std::move(name), original_assign);
							return true;
						}
						if (have) {
							input.putback(current);
						}
						next(std::move(name), original);
						return true;
					};
					if (check(L'*', LexedFile::Token::Mul, LexedFile::Token::MulAssign)) continue;
					if (check(L'%', LexedFile::Token::Mod, LexedFile::Token::ModAssign)) continue;
					if (check(L'=', LexedFile::Token::Assign, LexedFile::Token::EqualComparison)) continue;
					if (check(L'!', LexedFile::Token::LogicalNot, LexedFile::Token::NotEqualComparison)) continue;
					if (check(L'~', LexedFile::Token::NOT, LexedFile::Token::NOTAssign)) continue;
					if (check(L'^', LexedFile::Token::XOR, LexedFile::Token::XORAssign)) continue;
				}

				{
					// For tokens that take the form (for example)
					// + ++ +=
					// - -- -=
					// | || |=
					// & && &=
					auto triple_check = [&](
						wchar_t token,
						LexedFile::Token original,
						LexedFile::Token original_original,
						LexedFile::Token original_equals) -> bool
					{
						if (current != token) {
							return false;
						}
						std::wstring name(1, token);
						const bool have = read_ahead();
						if (have && current == token) {
							name += token;
							next(std::move(name), original_original);
							return true;
						}
						if (have && current == L'=') {
							name += L'=';
							next(std::move(name), original_equals);
							return true;
						}
						if (have) {
							input.putback(current);
						}
						next(std::move(name), original);
						return true;
					};
					// Triple group: +, ++, +=, -, --, -=, |, ||, |=, &, &&, &=
					if (triple_check(
						L'+',
						LexedFile::Token::Plus,
						LexedFile::Token::PlusPlus,
						LexedFile::Token::PlusAssign
					)) continue;

					// Pointer member access operator
					// Handle the special case, then restore '-' so the triple check below
					// sees it as if nothing had been read.
					if (current == L'-') {
						const bool have = read_ahead();
						if (have && current == L'>') {
							next(L"->", LexedFile::Token::PointerMemberAccess);
							continue;
						}
						if (have) {
							input.putback(current);
						}
						current = L'-';
					}
					if (triple_check(
						L'-',
						LexedFile::Token::Sub,
						LexedFile::Token::MinusMinus,
						LexedFile::Token::SubAssign
					)) continue;

					if (triple_check(
						L'|',
						LexedFile::Token::OR,
						LexedFile::Token::LogicalOr,
						LexedFile::Token::ORAssign
					)) continue;
					if (triple_check(
						L'&',
						LexedFile::Token::AND,
						LexedFile::Token::LogicalAnd,
						LexedFile::Token::ANDAssign
					)) continue;
				}

				{
					// Written to handle <, <<, <=, <<= and >, >>, >=, >>=
					auto quadruple_check = [&](
						wchar_t token,
						LexedFile::Token original,
						LexedFile::Token original_original,
						LexedFile::Token original_equals,
						LexedFile::Token original_original_equals) -> bool
					{
						if (current != token) {
							return false;
						}
						std::wstring name(1, token);
						bool have = read_ahead();
						if (have && current == L'=') {
							name += L'=';
							next(std::move(name), original_equals);
							return true;
						}
						if (have && current == token) {
							name += token;
							// Second look-ahead distinguishes the shift from shift-assign.
							have = read_ahead();
							if (have && current == L'=') {
								name += L'=';
								next(std::move(name), original_original_equals);
								return true;
							}
							if (have) {
								input.putback(current);
							}
							next(std::move(name), original_original);
							return true;
						}
						if (have) {
							input.putback(current);
						}
						next(std::move(name), original);
						return true;
					};
					// Pretty much only <<=, <<, <=, < and >>=, >>, >=, > fit into the quadruple group.
					if (quadruple_check(
						L'<',
						LexedFile::Token::LessThan,
						LexedFile::Token::LeftShift,
						LexedFile::Token::LessThanOrEqual,
						LexedFile::Token::LeftShiftAssign
					)) continue;
					if (quadruple_check(
						L'>',
						LexedFile::Token::GreaterThan,
						LexedFile::Token::RightShift,
						LexedFile::Token::GreaterThanOrEqual,
						LexedFile::Token::RightShiftAssign
					)) continue;
				}

				{
					// That's every operator. Now on to the other syntactic elements:
					// elements of just one character.
					auto syntactic = [&](wchar_t token, LexedFile::Token original) -> bool {
						if (current != token) {
							return false;
						}
						next(std::wstring(1, token), original);
						return true;
					};

					if (syntactic(L'(', LexedFile::Token::OpenParen)) continue;
					if (syntactic(L')', LexedFile::Token::CloseParen)) continue;
					if (syntactic(L'[', LexedFile::Token::OpenSquare)) continue;
					if (syntactic(L']', LexedFile::Token::CloseSquare)) continue;
					if (syntactic(L':', LexedFile::Token::Colon)) continue;
					if (syntactic(L',', LexedFile::Token::Comma)) continue;
					if (syntactic(L';', LexedFile::Token::Semicolon)) continue;
					if (syntactic(L'}', LexedFile::Token::CloseCurlyParen)) continue;
					if (syntactic(L'{', LexedFile::Token::OpenCurlyParen)) continue;
				}

				// Just dot, ellipses left, and then literal integers, floats, etc
				if (current == L'.') {
					const bool have = read_ahead();
					if (have && current == L'.') {
						// The only possible thing that can be here is another dot anyways
						if (read_ahead() && current == L'.') {
							next(L"...", LexedFile::Token::Ellipses);
							continue;
						}
						// Double dot illegal!
						throw std::runtime_error("Lexer failure: encountered '..'");
					}
					if (have) {
						input.putback(current);
					}
					next(L".", LexedFile::Token::MemberAccess);
					continue;
				}

				// Literals
				// TODO: Send the actual literal data with the lexeme, not just the type
				// If the stack is empty and we are a numeric character, then we begin a numeric literal.
				// (With a non-empty stack the digit is part of an identifier such as "x1".)
				if (stack.empty() && current >= L'0' && current <= L'9') {
					// Consume until no more decimals
					bool have = read_ahead();
					while (have && current >= L'0' && current <= L'9') {
						have = read_ahead();
					}
					// If the next character is a dot, we're a floating-point literal
					if (have && current == L'.') {
						// Consume all the characters that make up the float
						have = read_ahead();
						while (have && current >= L'0' && current <= L'9') {
							have = read_ahead();
						}
						next(L"float", LexedFile::Token::Float);
						// Make "current" available for the next read as it's not part of the literal
						if (have) {
							input.putback(current);
						}
						continue;
					}
					next(L"integral", LexedFile::Token::Integral);
					if (have) {
						input.putback(current);
					}
					continue;
				}

				// Character literal
				if (current == L'\'') {
					if (!read_ahead()) {
						throw std::runtime_error("Malformed character literal");
					}
					if (current == L'\\') {
						// Escape sequence: only \n, \t and \r are accepted (not exhaustive atm)
						if (!read_ahead() || (current != L'n' && current != L't' && current != L'r')) {
							throw std::runtime_error("Malformed character literal");
						}
					}
					// The next character must be a ' to be well-formed; running out of
					// input here is malformed too.
					if (!read_ahead() || current != L'\'') {
						throw std::runtime_error("Malformed character literal");
					}
					next(L"char", LexedFile::Token::Character);
					continue;
				}

				// String literal
				if (current == L'"') {
					while (true) {
						// Skip to the next double quote (or the end of input)
						while (input >> current && current != L'"');
						if (!input)
							throw std::runtime_error("Non-terminated string literal");
						// A quote preceded by a backslash does not close the literal
						if (input.previous() != L'\\') {
							break;
						}
					}
					next(L"string", LexedFile::Token::String);
					continue;
				}

				// Not recognized, push and go again, and we don't want next() in this case.
				stack.push_back(current);
			}
			// Flush a word that runs right up to the end of the input.
			next_no_token();
			return l;
		}