Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <vector>
- #include <map>
- #include <boost/utility/string_view.hpp>
- class TokenizerExample {
- public:
- boost::string_view operator()(boost::string_view& str,
- boost::string_view delimiter) const {
- boost::string_view retval;
- while (retval.empty()) {
- std::size_t pos = str.find_first_of(delimiter);
- if (pos == str.npos) {
- retval = str;
- str.clear();
- return retval;
- }
- retval = str.substr(0, pos);
- str.remove_prefix(pos + 1);
- }
- return retval;
- }
- };
- template<typename TokenizerType>
- std::vector<std::size_t> Encode(const std::string& str,
- const std::string& delimiter,
- TokenizerType tokenizer) {
- boost::string_view strView(str);
- boost::string_view delimiterView(delimiter);
- boost::string_view token;
- std::map<boost::string_view, std::size_t> dict;
- std::vector<std::size_t> result;
- token = tokenizer(strView, delimiterView);
- std::size_t numLabels = 0;
- while (!token.empty()) {
- if (dict.find(token) == dict.end()) {
- dict[token] = numLabels++;
- }
- result.push_back(dict.at(token));
- token = tokenizer(strView, delimiterView);
- }
- return result;
- }
- int main()
- {
- std::string tmp = "abc def abc \t abced def";
- std::vector<std::size_t> result = Encode(tmp, " \t\n", TokenizerExample());
- for (auto item : result)
- std::cout << item << std::endl;
- std::cout << "Example #2" << std::endl;
- std::vector<std::size_t> result2 =
- Encode(tmp, " \t\n", [](boost::string_view& str,
- boost::string_view ) {
- if (str.empty())
- return str;
- boost::string_view retval = str.substr(0, 1);
- str.remove_prefix(1);
- return retval;
- });
- for (auto item : result2)
- std::cout << item << std::endl;
- return 0;
- }
Add Comment
Please, Sign In to add comment