Advertisement
Guest User

Untitled

a guest
Sep 26th, 2017
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.03 KB | None | 0 0
  1.  
  2. class InputFile {
  3. std::wstring filename;
  4. public:
  5. InputFile(const std::wstring& argfilename)
  6. : filename(argfilename) {}
  7. LexedFile Lex() {
  8. LexedFile l;
  9.  
  10. auto FileHandle = CreateFile(
  11. filename.c_str(),
  12. GENERIC_READ,
  13. FILE_SHARE_READ,
  14. nullptr,
  15. OPEN_EXISTING,
  16. FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN,
  17. 0
  18. );
  19. std::vector<wchar_t> buffer;
  20. LARGE_INTEGER size;
  21. GetFileSizeEx(FileHandle, &size);
  22. buffer.resize(size.QuadPart);
  23. DWORD read;
  24. ReadFile(
  25. FileHandle,
  26. &buffer[0],
  27. size.QuadPart,
  28. &read,
  29. nullptr
  30. );
  31. CloseHandle(FileHandle);
  32. StreamInput input(&buffer);
  33.  
  34. //std::wifstream input(filename, std::ios::in);
  35. std::vector<wchar_t> stack;
  36. wchar_t current = 0;
  37. static const std::unordered_map<std::wstring, LexedFile::Token> reserved_words(
  38. []() -> std::unordered_map<std::wstring, LexedFile::Token> {
  39. std::unordered_map<std::wstring, LexedFile::Token> retval;
  40. retval[L"namespace"] = LexedFile::Token::Namespace;
  41. retval[L"for"] = LexedFile::Token::For;
  42. retval[L"while"] = LexedFile::Token::While;
  43. retval[L"do"] = LexedFile::Token::Do;
  44. retval[L"switch"] = LexedFile::Token::Switch;
  45. retval[L"case"] = LexedFile::Token::Case;
  46. retval[L"default"] = LexedFile::Token::Default;
  47. retval[L"try"] = LexedFile::Token::Try;
  48. retval[L"catch"] = LexedFile::Token::Catch;
  49. retval[L"auto"] = LexedFile::Token::Auto;
  50. retval[L"type"] = LexedFile::Token::Type;
  51. retval[L"break"] = LexedFile::Token::Break;
  52. retval[L"continue"] = LexedFile::Token::Continue;
  53. retval[L"return"] = LexedFile::Token::Return;
  54. retval[L"static"] = LexedFile::Token::Static;
  55. retval[L"sizeof"] = LexedFile::Token::Sizeof;
  56. retval[L"decltype"] = LexedFile::Token::Decltype;
  57. retval[L"if"] = LexedFile::Token::If;
  58. retval[L"else"] = LexedFile::Token::Else;
  59. return retval;
  60. }()
  61. );
  62.  
  63. // Dumps the stack, looking for reserved words or identifiers.
  64. auto next_no_token = [&] {
  65. if (stack.empty()) {
  66. return;
  67. }
  68. std::wstring token(stack.begin(), stack.end());
  69. stack.clear();
  70. if (reserved_words.find(token) != reserved_words.end()) {
  71. l.tokens.push_back(std::make_pair(reserved_words.find(token)->first, reserved_words.find(token)->second));
  72. return;
  73. }
  74. std::wregex regex(L"^[_A-Za-z]\\w*$");
  75. if (!std::regex_match(token, regex)) {
  76. __debugbreak();
  77. throw std::runtime_error("Malformed identifier");
  78. }
  79. l.tokens.push_back(std::make_pair(token, LexedFile::Token::Identifier));
  80. };
  81. auto next = [&](std::wstring name, Wide::LexedFile::Token t) {
  82. if (stack.empty()) {
  83. l.tokens.push_back(std::make_pair(std::move(name), t));
  84. return;
  85. }
  86. std::wstring token(stack.begin(), stack.end());
  87. stack.clear();
  88. if (reserved_words.find(token) != reserved_words.end()) {
  89. l.tokens.push_back(std::make_pair(reserved_words.find(token)->first, reserved_words.find(token)->second));
  90. return;
  91. }
  92. std::wregex regex(L"^[_A-Za-z]\\w*$");
  93. if (!std::regex_match(token, regex)) {
  94. __debugbreak();
  95. throw std::runtime_error("Malformed identifier");
  96. }
  97. l.tokens.push_back(std::make_pair(token, LexedFile::Token::Identifier));
  98. l.tokens.push_back(std::make_pair(std::move(name), t));
  99. };
  100. input >> current;
  101. if (current == 0xFEFF || current == 0xFFFE) {
  102. // BOM
  103. } else {
  104. input.putback(current);
  105. }
  106. while(input >> current) {
  107. // First we eliminate whitespace, when possible- including comments.
  108. // Then we define predicates to recognize tokens taking various forms.
  109. // Then they're called.
  110. // Then if none of them find anything, we put the character down as "identifier" in the stack and move on.
  111.  
  112. // Whitespace
  113. if (current == L' ' || current == L'\n' || current == L'\t' || current == L'\r' ) {
  114. next_no_token();
  115. continue;
  116. }
  117.  
  118. // Comment - also div and div-equals
  119. if (current == L'/') {
  120. // Check for comments first- we'll look ahead
  121. input >> current;
  122. if (current == L'/') {
  123. // single-line comment
  124. // explicitly empty while loop
  125. while(input >> current && current != L'\n');
  126. next_no_token();
  127. continue;
  128. }
  129. if (current == L'*') {
  130. // multi-line comment
  131. while(true) {
  132. // Wait until we find a terminator.
  133. while(input >> current && current != L'*');
  134. input >> current;
  135. if (current == L'/') {
  136. break;
  137. }
  138. // If we found a * for some other reason, like commented out code, then
  139. // just keep going again looking for one.
  140. }
  141. next_no_token();
  142. continue;
  143. }
  144.  
  145. // If we weren't a comment, check for /=
  146. if (current == L'=') {
  147. next(L"/=", LexedFile::Token::DivAssign);
  148. continue;
  149. }
  150.  
  151. // We don't have any more dual-character tokens to check for
  152. next(L"/", LexedFile::Token::Div);
  153. input.putback(current); // Put back the look-ahead token
  154. continue;
  155. }
  156.  
  157. {
  158. // Check for the operators that only exist on their own, and as assignment versions
  159. // "double_check"
  160. // ! !=
  161. // ~ ~=
  162. // % %=
  163. // * *=
  164. auto check = [&](wchar_t token, LexedFile::Token original, LexedFile::Token original_assign) -> bool {
  165. if (current == token) {
  166. // Look-ahead
  167. input >> current;
  168. if (current == L'=') {
  169. std::wstring name;
  170. name += token;
  171. name += L"=";
  172. next(std::move(name), original_assign);
  173. return true;
  174. }
  175. std::wstring name;
  176. name += token;
  177. input.putback(current);
  178. next(std::move(name), original);
  179. return true;
  180. }
  181. return false;
  182. };
  183. // Grab *, *=, %, %=, =, ==, !, !=, ~, ~=
  184. if (check(L'*', LexedFile::Token::Mul, LexedFile::Token::MulAssign)) continue;
  185. if (check(L'%', LexedFile::Token::Mod, LexedFile::Token::ModAssign)) continue;
  186. if (check(L'=', LexedFile::Token::Assign, LexedFile::Token::EqualComparison)) continue;
  187. if (check(L'!', LexedFile::Token::LogicalNot, LexedFile::Token::NotEqualComparison)) continue;
  188. if (check(L'~', LexedFile::Token::NOT, LexedFile::Token::NOTAssign)) continue;
  189. if (check(L'^', LexedFile::Token::XOR, LexedFile::Token::XORAssign)) continue;
  190. }
  191.  
  192. {
  193. // For tokens that take the form (for example)
  194. // + ++ +=
  195. // - -- -=
  196. // | || |=
  197. // & && &=
  198. auto triple_check = [&](
  199. wchar_t token,
  200. LexedFile::Token original,
  201. LexedFile::Token original_original,
  202. LexedFile::Token original_equals) -> bool
  203. {
  204. if (current == token) {
  205. input >> current;
  206. if (current == token) {
  207. std::wstring name;
  208. name += token;
  209. name += token;
  210. next(std::move(name), original_original);
  211. return true;
  212. }
  213. if (current == L'=') {
  214. std::wstring name;
  215. name += token;
  216. name += L'=';
  217. next(std::move(name), original_equals);
  218. return true;
  219. }
  220. input.putback(current);
  221. std::wstring name;
  222. name += token;
  223. next(std::move(name), original);
  224. return true;
  225. }
  226. return false;
  227. };
  228. // Triple group: +, ++, +=, -, --, -=, |, ||, |=, &, &&, &=
  229. if (triple_check(
  230. L'+',
  231. LexedFile::Token::Plus,
  232. LexedFile::Token::PlusPlus,
  233. LexedFile::Token::PlusAssign
  234. )) continue;
  235.  
  236. // Pointer member access operator
  237. // Handle the special case, then just move on
  238. if (current == L'-') {
  239. input >> current;
  240. if (current == L'>') {
  241. next(L"->", LexedFile::Token::PointerMemberAccess);
  242. continue;
  243. }
  244. input.putback(current);
  245. current = L'-';
  246. }
  247. if (triple_check(
  248. L'-',
  249. LexedFile::Token::Sub,
  250. LexedFile::Token::MinusMinus,
  251. LexedFile::Token::SubAssign
  252. )) continue;
  253.  
  254. if (triple_check(
  255. L'|',
  256. LexedFile::Token::OR,
  257. LexedFile::Token::LogicalOr,
  258. LexedFile::Token::ORAssign
  259. )) continue;
  260. if (triple_check(
  261. L'&',
  262. LexedFile::Token::AND,
  263. LexedFile::Token::LogicalAnd,
  264. LexedFile::Token::ANDAssign
  265. )) continue;
  266. }
  267.  
  268. {
  269. // Written to handle <, <<, <=, <<= and >, >>, >=, >>=
  270. auto quadruple_check = [&](
  271. wchar_t token,
  272. LexedFile::Token original,
  273. LexedFile::Token original_original,
  274. LexedFile::Token original_equals,
  275. LexedFile::Token original_original_equals) -> bool
  276. {
  277. if (current == token) {
  278. input >> current;
  279. if (current == L'=') {
  280. std::wstring name;
  281. name += token;
  282. name += L'=';
  283. next(std::move(name), original_equals);
  284. return true;
  285. }
  286. if (current == token) {
  287. // We can put this back, and then just "check" for bitwise, which we know will succeed.
  288. // Look-ahead
  289. input >> current;
  290. if (current == L'=') {
  291. std::wstring name;
  292. name += token;
  293. name += token;
  294. name += L'=';
  295. next(std::move(name), original_original_equals);
  296. return true;
  297. }
  298. input.putback(current);
  299. std::wstring name;
  300. name += token;
  301. name += token;
  302. next(std::move(name), original_original);
  303. return true;
  304. }
  305. input.putback(current);
  306. std::wstring name;
  307. name += token;
  308. next(std::move(name), original);
  309. return true;
  310. }
  311. return false;
  312. };
  313. // Pretty much only <<=, <<, <=, < and >>=, >>, >=, > fit into the quadruple group.
  314. if (quadruple_check(
  315. L'<',
  316. LexedFile::Token::LessThan,
  317. LexedFile::Token::LeftShift,
  318. LexedFile::Token::LessThanOrEqual,
  319. LexedFile::Token::LeftShiftAssign
  320. )) continue;
  321. if (quadruple_check(
  322. L'>',
  323. LexedFile::Token::GreaterThan,
  324. LexedFile::Token::RightShift,
  325. LexedFile::Token::GreaterThanOrEqual,
  326. LexedFile::Token::RightShiftAssign
  327. )) continue;
  328. }
  329.  
  330. {
  331. // That's everything. Now on to the other syntactic elements
  332. // Elements of just one character
  333. auto syntactic = [&](wchar_t token, LexedFile::Token original) -> bool {
  334. if (current == token) {
  335. std::wstring name;
  336. name += token;
  337. next(std::move(name), original);
  338. return true;
  339. }
  340. return false;
  341. };
  342.  
  343. if (syntactic(L'(', LexedFile::Token::OpenParen)) continue;
  344. if (syntactic(L')', LexedFile::Token::CloseParen)) continue;
  345. if (syntactic(L'[', LexedFile::Token::OpenSquare)) continue;
  346. if (syntactic(L']', LexedFile::Token::CloseSquare)) continue;
  347. if (syntactic(L':', LexedFile::Token::Colon)) continue;
  348. if (syntactic(L',', LexedFile::Token::Comma)) continue;
  349. if (syntactic(L';', LexedFile::Token::Semicolon)) continue;
  350. if (syntactic(L'}', LexedFile::Token::CloseCurlyParen)) continue;
  351. if (syntactic(L'{', LexedFile::Token::OpenCurlyParen)) continue;
  352. }
  353.  
  354. // Just dot, ellipses left, and then literal integers, floats, etc
  355. if (current == L'.') {
  356. input >> current;
  357. if (current == L'.') {
  358. // The only possible thing that can be here is another dot anyways
  359. input >> current;
  360. if (current == L'.') {
  361. next(L"...", LexedFile::Token::Ellipses);
  362. continue;
  363. }
  364. // Double dot illegal!
  365. throw std::runtime_error("Lexer failure: encountered '..'");
  366. }
  367. input.putback(current);
  368. next(L".", LexedFile::Token::MemberAccess);
  369. continue;
  370. }
  371.  
  372. // Literals
  373. // TODO: Send the actual literal data with the lexeme, not just the type
  374. // If the stack is empty and we are a numeric character, then we begin a numeric literal.
  375. if (stack.empty() && current >= L'0' && current <= L'9') {
  376. // Consume until no more decimals
  377. while(input >> current && current >= L'0' && current <= L'9');
  378. // If the next character is a dot, we're a floating-point literal
  379. if (current == L'.') {
  380. // Consume all the characters that make up the float
  381. while(input >> current && current >= L'0' && current <= L'9');
  382. next(L"float", LexedFile::Token::Float);
  383. // Make "current" available for the next read as it's not part of the literal
  384. input.putback(current);
  385. continue;
  386. }
  387. next(L"integral", LexedFile::Token::Integral);
  388. input.putback(current);
  389. continue;
  390. }
  391.  
  392. // Character literal
  393. if (current == L'\'') {
  394. // Character literal
  395. if (input >> current && current != L'\\') {
  396. // The next character must be a ' to be well-formed
  397. if (input >> current && current != L'\'') {
  398. throw std::runtime_error("Malformed character literal");
  399. }
  400. // Well-formed character lit
  401. next(L"char", LexedFile::Token::Character);
  402. continue;
  403. }
  404. // It's a slash. Grab the next character.
  405. input >> current;
  406. if (current != L'n' || current != L't' || current != L'r') { // not exhaustive atm
  407. throw std::runtime_error("Malformed character literal");
  408. }
  409. // And the remainder should be the '
  410. if (input >> current && current != L'\'') {
  411. throw std::runtime_error("Malformed character literal");
  412. }
  413. next(L"char", LexedFile::Token::Character);
  414. continue;
  415. }
  416.  
  417. if (current == L'"') {
  418. while(true) {
  419. while(input >> current && current != L'"');
  420. if (!input)
  421. throw std::runtime_error("Non-terminated string literal");
  422. if (input.previous() != L'\\') {
  423. break;
  424. }
  425. }
  426. next(L"string", LexedFile::Token::String);
  427. continue;
  428. }
  429.  
  430. // Not recognized, push and go again, and we don't want next() in this case.
  431. stack.push_back(current);
  432. }
  433. return l;
  434. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement