Advertisement
HITOA

BaseTokenizer (Header Only)

Jan 11th, 2022
181
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.94 KB | None | 0 0
  1. #pragma once
  2. #include <utility>
  3. #include <string_view>
  4. #include <vector>
  5. #include <assert.h>
  6.  
  7. template<typename cT, typename dT>
  8. class BaseTokenizer
  9. {
  10. public:
  11. using ValueType = cT;
  12. using DataType = dT;
  13. using StringType = std::basic_string_view<ValueType>;
  14. public:
  15. class Consumer {
  16. public:
  17. using TokenType = std::pair<StringType, DataType>;
  18. public:
  19. Consumer() = default;
  20. Consumer(const Consumer& consumer) : tokens{ consumer.tokens } {};
  21. Consumer(std::vector<TokenType> tokens) : tokens{ tokens } {};
  22. Consumer& operator=(const Consumer& consumer) { tokens = consumer.tokens; };
  23. ~Consumer() { tokens.clear(); };
  24. public:
  25. const TokenType& Peek() const
  26. {
  27. if (tokens.size() > 0)
  28. return tokens[0];
  29. return TokenType{};
  30. }
  31.  
  32. const TokenType& Peek(int n) const
  33. {
  34. if (tokens.size() > n)
  35. return tokens[n];
  36. return TokenType{};
  37. };
  38.  
  39. TokenType Consume()
  40. {
  41. if (tokens.size() == 0)
  42. return TokenType{};
  43. TokenType token = tokens[0];
  44. tokens.erase(tokens.begin());
  45. return token;
  46. };
  47.  
  48. TokenType Consume(int n)
  49. {
  50. if (tokens.size() <= n)
  51. return TokenType{};
  52. TokenType token = tokens[n];
  53. tokens.erase(tokens.begin() + n);
  54. return token;
  55. };
  56. private:
  57. std::vector<TokenType> tokens;
  58. };
  59. public:
  60. virtual std::pair<StringType, DataType> Tokenize(const StringType&) = 0;
  61.  
  62. std::vector<std::pair<StringType, DataType>> TokenizeAll(StringType str)
  63. {
  64. std::vector<std::pair<StringType, DataType>> tokens{};
  65.  
  66. for (int i = 0; i < str.size();) {
  67. StringType nstr{ str.data() + i, str.size() - i };
  68. std::pair<StringType, DataType> r = Tokenize(nstr);
  69.  
  70. assert(("tokenizer must return token with value greater than 0", r.first.length() > 0));
  71. assert(("tokenizer return value outside bound of string", r.first.length() + i <= str.length()));
  72.  
  73. tokens.push_back(r);
  74. i += r.first.length();
  75. }
  76.  
  77. return tokens;
  78. }
  79.  
  80. Consumer TokenizeAndGetConsumer(StringType str)
  81. {
  82. return Consumer{ TokenizeAll(str) };
  83. }
  84.  
  85. StringType TokenizeSymbol(const StringType& str)
  86. {
  87. return str.substr(0, 1);
  88. }
  89.  
  90. StringType TokenizeWord(const StringType& str, int size)
  91. {
  92. assert(("word length is greater than str length", str.length() >= size));
  93. return str.substr(0, size);
  94. }
  95.  
  96. StringType TokenizeEnclosure(const StringType& str, ValueType endv, bool takeLast = true)
  97. {
  98. assert(("length must be greater than 1", str.length() > 1));
  99. for (int i = 1; i < str.length(); i++)
  100. if (str[i] == endv)
  101. return str.substr(0, (size_t)(takeLast ? i + 1 : i));
  102. return StringType{};
  103. }
  104.  
  105. StringType TokenizeSequence(const StringType& str, std::vector<ValueType> values)
  106. {
  107. for (int i = 0; i < str.length(); i++)
  108. if (std::find(values.begin(), values.end(), str[i]) == values.end())
  109. return str.substr(0, i);
  110. return StringType{};
  111. }
  112.  
  113. StringType TryTokenizeSymbol(const StringType& str, std::vector<ValueType> symbols)
  114. {
  115. for (const ValueType& symbol : symbols)
  116. if (str[0] == symbol)
  117. return str.substr(0, 1);
  118. return StringType{};
  119. }
  120.  
  121. StringType TryTokenizeWord(const StringType& str, std::vector<StringType> words)
  122. {
  123. for (const StringType& word : words)
  124. if (str.length() >= word.length() && str.substr(0, word.length()) == word)
  125. return str.substr(0, word.length());
  126. return StringType{};
  127. }
  128.  
  129. StringType TryTokenizeEnclosure(const StringType& str, std::vector<ValueType> endvs, bool takeLast = true)
  130. {
  131. assert(("length must be greater than 1", str.length() > 1));
  132. for (int i = 1; i < str.length(); i++)
  133. if (std::find(endvs.begin(), endvs.end(), str[i]) != endvs.end())
  134. return str.substr(0, (size_t)(takeLast ? i + 1 : i));
  135. return StringType{};
  136. }
  137. };
  138.  
  139. template<typename dT>
  140. using U8BaseTokenizer = BaseTokenizer<char, dT>;
  141.  
  142. template<typename dT>
  143. using U16BaseTokenizer = BaseTokenizer<char16_t, dT>;
  144.  
  145. template<typename dT>
  146. using U32BaseTokenizer = BaseTokenizer<char32_t, dT>;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement