Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <string>
- #include <iostream>
- #include <list>
- #include <vector>
- #include <map>
- #include <regex>
- #include <assert.h>
- #include <fstream>
- #include <set>
- #include <readline/readline.h>
- #include <readline/history.h>
- #include <boost/regex.hpp>
- using namespace std;
- class Tokenizer {
- public:
- Tokenizer() {}
- struct TokenExpr {
- boost::regex TOKEN_STRING {"\".*?\""};
- boost::regex TOKEN_PARAM {"\\(.*?\\)"};
- boost::regex TOKEN_BRACKETS {"\\{.*?\\}"};
- boost::regex TOKEN_FILE {"\\w*\\.\\w*"};
- boost::regex TOKEN_WORD {"[^\\W\\s]\\w+"};
- boost::regex TOKEN_INT {"\\b\\d*?\\.*?\\d?\\b"};
- boost::regex TOKEN_DASH {"-\\b\\w+\\s"};
- boost::regex TOKEN_EQUAL {"="};
- boost::regex TOKEN_QUOTE {"\""};
- boost::regex TOKEN_COMMA {","};
- boost::regex TOKEN_OP {"(\\+|-|\\*|\\/|%)"};
- boost::regex TOKEN_PIPE {"\\|"};
- boost::regex TOKEN_AMPERSAND {"\\&"};
- boost::regex TOKEN_PAREN_LEFT {"\\("};
- boost::regex TOKEN_PAREN_RIGHT {"\\)"};
- } te;
- struct TokenType {
- string TOKEN;
- } tt;
- TokenType TOKEN_STRING = {"STRING"},
- TOKEN_PARAM = {"PARAM"},
- TOKEN_BRACKETS = {"BRACKETS"},
- TOKEN_FILE = {"FILE"},
- TOKEN_WORD = {"WORD"},
- TOKEN_INT = {"INT"},
- TOKEN_DASH = {"DASH"},
- TOKEN_EQUAL = {"EQUAL"},
- TOKEN_QUOTE = {"QUOTE"},
- TOKEN_COMMA = {"COMMA"},
- TOKEN_OP = {"OP"},
- TOKEN_PIPE = {"PIPE"},
- TOKEN_AMPERSAND = {"AMPERSAND"},
- TOKEN_PAREN_LEFT = {"PAREN_LEFT"},
- TOKEN_PAREN_RIGHT = {"PAREN_RIGHT"};
- struct TokenMap {
- string str;
- int begin_pos;
- int end_pos;
- int index;
- TokenType tt;
- };
- TokenMap tmap;
- TokenType get_token_type(string token) {
- string::const_iterator start, end;
- start = token.begin();
- end = token.end();
- boost::match_results<string::const_iterator> what;
- while (regex_search(start, end, what, te.TOKEN_STRING)) {
- cout << "TOKEN_STRING" << endl;
- start = what[0].second;
- return TOKEN_STRING;
- }
- while (regex_search(start, end, what, te.TOKEN_PARAM)) {
- cout << "TOKEN_PARAM" << endl;
- start = what[0].second;
- return TOKEN_PARAM;
- }
- while (regex_search(start, end, what, te.TOKEN_BRACKETS)) {
- cout << "TOKEN_BRACKETS" << endl;
- start = what[0].second;
- return TOKEN_BRACKETS;
- }
- while (regex_search(start, end, what, te.TOKEN_FILE)) {
- cout << "TOKEN_FILE" << endl;
- start = what[0].second;
- return TOKEN_FILE;
- }
- while (regex_search(start, end, what, te.TOKEN_WORD)) {
- cout << "TOKEN_WORD" << endl;
- start = what[0].second;
- return TOKEN_WORD;
- }
- while (regex_search(start, end, what, te.TOKEN_INT)) {
- cout << "TOKEN_INT" << endl;
- start = what[0].second;
- return TOKEN_INT;
- }
- while (regex_search(start, end, what, te.TOKEN_DASH)) {
- cout << "TOKEN_DASH" << endl;
- start = what[0].second;
- return TOKEN_DASH;
- }
- while (regex_search(start, end, what, te.TOKEN_EQUAL)) {
- cout << "TOKEN_EQUAL" << endl;
- start = what[0].second;
- return TOKEN_EQUAL;
- }
- while (regex_search(start, end, what, te.TOKEN_QUOTE)) {
- cout << "TOKEN_QUOTE" << endl;
- start = what[0].second;
- return TOKEN_QUOTE;
- }
- while (regex_search(start, end, what, te.TOKEN_COMMA)) {
- cout << "TOKEN_COMMA" << endl;
- start = what[0].second;
- return TOKEN_COMMA;
- }
- while (regex_search(start, end, what, te.TOKEN_OP)) {
- cout << "TOKEN_OP" << endl;
- start = what[0].second;
- return TOKEN_OP;
- }
- while (regex_search(start, end, what, te.TOKEN_PIPE)) {
- cout << "TOKEN_PIPE" << endl;
- start = what[0].second;
- return TOKEN_PIPE;
- }
- while (regex_search(start, end, what, te.TOKEN_AMPERSAND)) {
- cout << "TOKEN_AMPERSAND" << endl;
- start = what[0].second;
- return TOKEN_AMPERSAND;
- }
- while (regex_search(start, end, what, te.TOKEN_PAREN_LEFT)) {
- cout << "TOKEN_PAREN_LEFT" << endl;
- start = what[0].second;
- return TOKEN_PAREN_LEFT;
- }
- while (regex_search(start, end, what, te.TOKEN_PAREN_RIGHT)) {
- cout << "TOKEN_PAREN_RIGHT" << endl;
- start = what[0].second;
- return TOKEN_PAREN_RIGHT;
- }
- }
- void tokenize(char* s) {
- str_ = string(s);
- vector<string> vec;
- boost::regex re {("\\w+|\\W")};
- boost::sregex_token_iterator i(str_.begin(), str_.end(), re);
- boost::sregex_token_iterator j;
- string t;
- string str = "";
- TokenType last_type;
- bool QUOTED = false;
- bool PARENS = false;
- bool BRACKETS = false;
- unsigned count = 0;
- int first_pos = 0;
- while (i != j) {
- if (*i != " " && *i != "\"" && *i != "(" && *i != ")" && *i != "{"
- && *i != "}" && QUOTED == false && PARENS == false && BRACKETS == false) {
- vec.push_back(*i);
- t = *i;
- tmap.str = *i;
- tmap.begin_pos = first_pos;
- tmap.end_pos = first_pos + t.size();
- tmap.index++;
- tmap.tt = get_token_type(t);
- last_type = tmap.tt;
- i++;
- count++;
- } else if ((*i != "\"" && QUOTED == true)
- || *i != ")" && PARENS == true
- || *i != "}" && BRACKETS == true) {
- str += *i;
- i++;
- } else if ((*i == "\"" && QUOTED == true)
- || *i == ")" && PARENS == true
- || *i == "}" && BRACKETS == true) {
- str += *i;
- vec.push_back(str);
- tmap.str = str;
- tmap.begin_pos = first_pos;
- tmap.end_pos = first_pos + str.size();
- tmap.index++;
- tmap.tt = get_token_type(str);
- last_type = tmap.tt;
- i++;
- count++;
- QUOTED = false;
- PARENS = false;
- BRACKETS = false;
- str = "";
- } else if (*i == "\"" && QUOTED == false) {
- str += *i;
- QUOTED = true;
- i++;
- } else if (*i == "(" && PARENS == false) {
- str += *i;
- PARENS = true;
- i++;
- } else if (*i == "{" && BRACKETS == false) {
- str += *i;
- BRACKETS = true;
- i++;
- } else {
- i++;
- }
- }
- for (int n=0;n<vec.size();n++) {
- cout << vec[n] << endl;
- }
- cout << "There were " << count << " tokens found." << endl;
- prompt_token_list = vec;
- }
- private:
- string str_;
- unsigned count;
- vector<string> prompt_token_list;
- };
- int main()
- {
- char* buf;
- Tokenizer tk;
- while ((buf = readline("Shell>> ")) != nullptr) {
- if (strlen(buf) > 0) {
- add_history(buf);
- tk.tokenize(buf);
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement