Advertisement
Guest User

Untitled

a guest
Jun 18th, 2019
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.32 KB | None | 0 0
#include "Lexer.hpp"

#include <stdexcept>
  2.  
  3. namespace sage
  4. {
  5. namespace detail
  6. {
  7. // ----------------------------------------------------------------------------
  8. inline char getCharAndAdvance(const char*& ptr)
  9. {
  10. return *ptr++;
  11. }
  12. // ----------------------------------------------------------------------------
  13. inline bool isHorizontalWhitespace(char c)
  14. {
  15. return c == ' ' || c == '\t' || c == '\f' || c == '\v';
  16. }
  17. // ----------------------------------------------------------------------------
  18. inline bool isVerticalWhitespace(char c)
  19. {
  20. return c == '\r' || c == '\n';
  21. }
  22. // ----------------------------------------------------------------------------
  23. Lexer::Lexer(std::string input)
  24. : m_input(std::move(input))
  25. , m_bufferStart(m_input.data())
  26. , m_bufferEnd(m_input.data() + m_input.size())
  27. , m_bufferPtr(m_input.data())
  28. {
  29. }
  30. // ----------------------------------------------------------------------------
  31. bool Lexer::skipWhitespace(Token& result, const char* curPtr)
  32. {
  33. auto c = *curPtr;
  34.  
  35. // Skip consecutive spaces.
  36. while(true) {
  37. if(!isHorizontalWhitespace(c) && !isVerticalWhitespace(c)) {
  38. // if we have something other than whitespace, we're done.
  39. break;
  40. }
  41.  
  42. c = *++curPtr;
  43. }
  44.  
  45. // If the client wants us to return whitespace, return it now.
  46. if(m_keepWhitespace) {
  47. formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
  48. return true;
  49. }
  50.  
  51. m_bufferPtr = curPtr;
  52. return false;
  53. }
  54. // ----------------------------------------------------------------------------
  55. bool Lexer::skipLineComment(Token& result, const char* curPtr)
  56. {
  57. // Scan over the body of the comment. The common case, when scanning, is that
  58. // the comment contains normal ascii characters with nothing interesting in
  59. // them. As such, optimize for this case with the inner loop.
  60. //
  61. // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  62. // character that ends the line comment.
  63. char C;
  64. while(true) {
  65. C = *curPtr;
  66. // Skip over characters in the fast loop.
  67. while(C != 0 && // Potentially EOF.
  68. C != '\n' && C != '\r') // Newline or DOS-style newline.
  69. C = *++curPtr;
  70.  
  71. break;
  72. }
  73.  
  74. // If we are returning comments as tokens, return this comment as a token.
  75. if(m_keepComments) {
  76. formTokenWithChars(result, curPtr, Token::Kind::Comment);
  77. return true;
  78. }
  79.  
  80. // Otherwise, eat the \n character. We don't care if this is a \n\r or
  81. // \r\n sequence. This is an efficiency hack (because we know the \n can't
  82. // contribute to another token), it isn't needed for correctness. Note that
  83. // this is ok even in KeepWhitespaceMode, because we would have returned the
  84. // comment above in that mode.
  85. ++curPtr;
  86.  
  87. m_bufferPtr = curPtr;
  88. return false;
  89. }
  90. // ----------------------------------------------------------------------------
  91. bool Lexer::lexIdentifier(Token& result, const char* curPtr)
  92. {
  93. // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  94. unsigned char C = *curPtr++;
  95. while((C >= 'A' && C <= 'Z') || (C >= 'a' && C <= 'z') ||
  96. (C >= '0' && C <= '9') || C == '_')
  97. C = *curPtr++;
  98.  
  99. --curPtr; // Back up over the skipped character.
  100.  
  101. formTokenWithChars(result, curPtr, Token::Kind::Identifier);
  102.  
  103. return true;
  104. }
  105. // ----------------------------------------------------------------------------
  106. bool Lexer::lexNumericConstant(Token& result, const char* curPtr)
  107. {
  108. (void)result;
  109. (void)curPtr;
  110. throw std::exception("Not implemented yet");
  111. }
  112. // ----------------------------------------------------------------------------
  113. bool Lexer::lexStringLiteral(Token& result,
  114. const char* curPtr,
  115. bool isSingleQuoted)
  116. {
  117. auto const quote = isSingleQuoted ? '\'' : '"';
  118.  
  119. char c = getCharAndAdvance(curPtr);
  120. while(c != quote) {
  121. // Skip escaped characters.
  122. if(c == '\\')
  123. c = getCharAndAdvance(curPtr);
  124.  
  125. if(c == '\n' || c == '\r' ||
  126. (c == 0 && curPtr - 1 == m_bufferEnd)) { // End of file.
  127. formTokenWithChars(result, curPtr - 1, Token::Kind::Unknown);
  128. return true;
  129. }
  130.  
  131. c = getCharAndAdvance(curPtr);
  132. }
  133.  
  134. formTokenWithChars(result, curPtr, Token::Kind::StringLiteral);
  135. return true;
  136. }
  137. // ----------------------------------------------------------------------------
  138. void Lexer::formTokenWithChars(Token& result,
  139. const char* tokEnd,
  140. Token::Kind kind)
  141. {
  142. auto const tokLen = static_cast<uint32_t>(tokEnd - m_bufferPtr);
  143.  
  144. result.setKind(kind);
  145. result.setLength(tokLen);
  146. result.setLocation(getSourceLocation(m_bufferPtr));
  147. result.setIdentifier(m_bufferPtr, tokLen);
  148.  
  149. m_bufferPtr = tokEnd;
  150. }
  151. SourceLocation Lexer::getSourceLocation(const char* ptr)
  152. {
  153. auto loc = SourceLocation{};
  154. auto curPtr = m_bufferStart;
  155.  
  156. loc.line = 1;
  157. while(curPtr < ptr) {
  158. if(isVerticalWhitespace(*curPtr++)) {
  159. loc.line++;
  160. loc.column = 0;
  161. } else {
  162. loc.column++;
  163. }
  164. }
  165.  
  166. return loc;
  167. }
  168. // ----------------------------------------------------------------------------
  169. bool Lexer::lexToken(Token& result)
  170. {
  171. LexNextToken:
  172. const char* curPtr = m_bufferPtr;
  173.  
  174. if(curPtr > m_bufferEnd)
  175. return false;
  176.  
  177. // Small amounts of horizontal whitespace are very common between tokens.
  178. if((*curPtr == ' ') || (*curPtr == '\t')) {
  179. ++curPtr;
  180. while((*curPtr == ' ') || (*curPtr == '\t'))
  181. ++curPtr;
  182.  
  183. if(m_keepWhitespace) {
  184. // If the user wants to keep whitespaces
  185. // form a new token and return
  186. formTokenWithChars(result, curPtr, Token::Kind::Whitespace);
  187. return true;
  188. }
  189.  
  190. m_bufferPtr = curPtr;
  191. }
  192.  
  193. auto c = getCharAndAdvance(curPtr);
  194. auto kind = Token::Kind::Unknown;
  195.  
  196. // Lex identifiers and constants
  197. if(c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
  198. return lexIdentifier(result, curPtr);
  199. if(c >= '0' && c <= '9')
  200. return lexNumericConstant(result, curPtr);
  201.  
  202. switch(c) {
  203. case '\0':
  204. // Found EOF?
  205. if(curPtr - 1 == m_bufferEnd) {
  206. formTokenWithChars(result, curPtr, Token::Kind::EndOfFile);
  207. return true;
  208. }
  209.  
  210. if(skipWhitespace(result, curPtr))
  211. return true;
  212.  
  213. goto LexNextToken;
  214. case '\r':
  215. if(*curPtr == '\n')
  216. c = getCharAndAdvance(curPtr);
  217. [[fallthrough]];
  218. case '\n':
  219. if(skipWhitespace(result, curPtr))
  220. return true;
  221.  
  222. // We only saw whitespace, so just try again
  223. goto LexNextToken;
  224. case ' ':
  225. case '\t':
  226. case '\f':
  227. case '\v':
  228. SkipHorizontalWhitespace:
  229. if(skipWhitespace(result, curPtr))
  230. return true;
  231. SkipIgnoredUnits:
  232. curPtr = m_bufferPtr;
  233. // If the next token is obviously a // or /* */ comment, skip it
  234. // efficiently too (without going through the big switch stmt).
  235. if(curPtr[0] == '/' && curPtr[1] == '/' && !m_keepComments) {
  236. if(skipLineComment(result, curPtr + 2))
  237. return true;
  238. goto SkipIgnoredUnits;
  239. } else if(isHorizontalWhitespace(*curPtr)) {
  240. goto SkipHorizontalWhitespace;
  241. }
  242. // We only saw whitespace, so just try again
  243. // (We manually eliminate the tail call to avoid recursion.)
  244. goto LexNextToken;
  245. case '\'':
  246. return lexStringLiteral(result, curPtr, true);
  247. case '"':
  248. return lexStringLiteral(result, curPtr, false);
  249. case '?':
  250. kind = Token::Kind::Question;
  251. break;
  252. case '[':
  253. kind = Token::Kind::LeftSquare;
  254. break;
  255. case ']':
  256. kind = Token::Kind::RightSquare;
  257. break;
  258. case '(':
  259. kind = Token::Kind::LeftParen;
  260. break;
  261. case ')':
  262. kind = Token::Kind::RightParen;
  263. break;
  264. case '{':
  265. kind = Token::Kind::LeftCurly;
  266. break;
  267. case '}':
  268. kind = Token::Kind::RightCurly;
  269. break;
  270. case '.': {
  271. auto const next = *curPtr;
  272. if(next >= '0' && next <= '9') {
  273. return lexNumericConstant(result, curPtr);
  274. } else {
  275. kind = Token::Kind::Period;
  276. }
  277. }
  278. case '*':
  279. kind = Token::Kind::Star;
  280. break;
  281. case '+':
  282. kind = Token::Kind::Plus;
  283. break;
  284. case '-':
  285. kind = Token::Kind::Minus;
  286. break;
  287. case '!':
  288. kind = Token::Kind::Exclam;
  289. break;
  290. case '/':
  291. if(*curPtr == '/') {
  292. if(skipLineComment(result, curPtr + 1))
  293. return true; // There is a token to return.
  294.  
  295. // It is common for the tokens immediately after a // comment to be
  296. // whitespace (indentation for the next line). Instead of going through
  297. // the big switch, handle it efficiently now.
  298. goto SkipIgnoredUnits;
  299. }
  300. kind = Token::Kind::Slash;
  301. break;
  302. case '>':
  303. kind = Token::Kind::Greater;
  304. break;
  305. case ':':
  306. kind = Token::Kind::Colon;
  307. break;
  308. case ';':
  309. kind = Token::Kind::SemiColon;
  310. break;
  311. case ',':
  312. kind = Token::Kind::Comma;
  313. break;
  314. case '#':
  315. kind = Token::Kind::Hash;
  316. break;
  317. default:
  318. kind = Token::Kind::Unknown;
  319. break;
  320. }
  321. formTokenWithChars(result, curPtr, kind);
  322. return true;
  323. }
  324. // ----------------------------------------------------------------------------
  325. std::vector<Token> Lexer::lexAllTokens()
  326. {
  327. auto result = std::vector<Token>{};
  328. auto token = Token{};
  329. while(lexToken(token)) {
  330. result.emplace_back(std::move(token));
  331. }
  332. return result;
  333. }
  334. // ----------------------------------------------------------------------------
  335. } // namespace detail
  336. } // namespace sage
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement