// cpp_tokenizer_mine

#include <iostream>
#include <fstream>
#include <regex>
#include <vector>
#include <string>
#include <iomanip>
using namespace std;


/*
 * Detects and merges binary operators.
 * Concatenates the element at the given index with the element that follows
 * it; when the result is one of the recognised two-character operators the
 * merged text is stored at that index and the following element is erased.
 * Returns the regex match object produced by that test.
 */
smatch bin_ops_merger (vector<string>&, size_t);

/*
 * Combines two vectors into a vector of pairs.
 * Element i of the result pairs element i of the first vector with
 * element i of the second, for every index of the first vector.
 */
template<typename T,typename U>
vector<pair<T,U>> merge_vectors (const vector<T>&, const vector<U>&);

/*
 * Assigns matching tokens to lexemes.
 * For every string in the first vector, appends one
 * (lexeme, token name) pair to the second vector.
 */
void tokenizer (vector<string>&, vector<pair<string,string>>&);


int main(int argc, char** argv) {
    //store full source file in a string
    string file, line;
    ifstream ifs("analyzethis.file");

    while (getline(ifs, line, '\0')){
        file+= line;
    }
    ifs.close();

    /*Strip single and multi line comments*/
    regex rexComments("(//.*)|(/\\*(?:.|[\\n\\r])*?\\*/)");

    string result;
    regex_replace(std::back_inserter(result), file.begin(), file.end(), rexComments, " ");
    file= result;


    /* REGEX Patterns:
     *      Not alphanumeric [\\W]
     *      number: (\\d+).(\\d+)
     *      string: (\".*\")
     */

    regex rexPtrn("[\\W]|(\\d+).(\\d+)|(\".*\")");


    //only alpha numeric, reverses negation
    regex_token_iterator<string::iterator> rtiNS(file.begin(), file.end(), rexPtrn, -1);
    //only symbols
    regex_token_iterator<string::iterator> rtiS(file.begin(), file.end(), rexPtrn);
    //end of line comparison
    regex_token_iterator<string::iterator> rtiEnd;


    vector<string> vData;   //holds token
    while ((rtiNS!=rtiEnd)&&(rtiS!=rtiEnd)){
        if((*rtiNS).length()>0)                             //if not symbol
            vData.push_back(*rtiNS);

        if((*rtiS).length()>0&&*rtiS!=" "&&*rtiS!="\t"&&*rtiS!="\n")
            vData.push_back(*rtiS);

        //advance iterators
        ++rtiNS;
        ++rtiS;
    }


    //FIND BINARY OPERATORS AND COMBINE THEM
    for (size_t ctr= 0; ctr < vData.size() - 1; ++ctr) {
        bin_ops_merger(vData, ctr);
    }


    /*Holds final matches*/
    vector<pair<string,string>> vTokenLexeme;
    tokenizer(vData, vTokenLexeme);


    //Save tokenized lexemes
    ofstream ofs("tokenized.txt");

    ofs<<string(30, '=')<<endl;
    ofs<<setw(17)<<right<<"Lexeme      ||"<<setw(10)<<"Token"<<endl;
    ofs<<string(30, '=')<<endl;

    for(auto it : vTokenLexeme){
        ofs<<setw(15)<<left<<it.first<<"||  "<<it.second<<endl;
    }

    ofs.close();


//  system("PAUSE");
    return 0;
}


/*
 * Combines two vectors into a vector of pairs.
 *
 * v1  supplies the first member of every pair and determines how many
 *     pairs are produced.
 * v2  supplies the second member; elements beyond v1.size() are ignored.
 *
 * Returns a vector in which element i is (v1[i], v2[i]).
 * Throws std::out_of_range when v2 holds fewer elements than v1.
 */
template<typename T,typename U>
std::vector<std::pair<T,U>> merge_vectors (const std::vector<T>& v1, const std::vector<U>& v2) {
    //element type follows the template arguments, so the function works for
    //any T/U and not only for strings
    std::vector<std::pair<T,U>> vOut;
    vOut.reserve(v1.size());

    for (std::size_t i = 0; i < v1.size(); ++i) {
        //at() keeps the bounds check that reports a too-short v2
        vOut.emplace_back(v1.at(i), v2.at(i));
    }
    return vOut;
}

/*
 * Detects and merges binary operators.
 *
 * vData  lexeme list, modified in place.
 * ctr    index of the left-hand lexeme; its right-hand neighbour is ctr+1.
 *
 * When vData[ctr] followed by vData[ctr+1] spells one of the two-character
 * operators below, the merged text is stored in vData[ctr] and the
 * neighbour is erased.
 *
 * Returns the match for the merged operator, or an empty match
 * (empty() == true) when nothing was merged, including the case where ctr
 * has no right-hand neighbour. A non-empty match refers to the characters
 * stored in vData[ctr]; it stays usable only until that element is changed
 * or vData reallocates.
 */
std::smatch bin_ops_merger (std::vector<std::string>& vData, std::size_t ctr) {
    //compiled once instead of on every call
    static const std::regex binaryOperatorsPattern("\\+=|-=|\\*=|/=|%=|&=|\\!=|==|\\|=|\\^="
        "|<=|>=|--|\\+\\+|<<|>>|&&|\\|\\||->");

    //A failed match taken against a string that lives for the whole program,
    //so the "nothing merged" result never refers to a destroyed string.
    static const std::string emptySubject;
    static const std::smatch noMerge = [] {
        std::smatch failed;
        std::regex_match(emptySubject, failed, binaryOperatorsPattern);
        return failed;
    }();

    //a merge needs both vData[ctr] and vData[ctr+1]
    if (vData.size() < 2 || ctr >= vData.size() - 1) {
        return noMerge;
    }

    const std::string candidate = vData[ctr] + vData[ctr + 1];
    if (!std::regex_match(candidate, binaryOperatorsPattern)) {
        return noMerge;
    }

    vData[ctr] = candidate;                     //merge operators
    vData.erase(vData.begin() + ctr + 1);       //delete extra element

    //Match against the stored element rather than the local candidate so
    //the iterators inside the returned object outlive this function.
    std::smatch binOpsMatch;
    std::regex_match(vData[ctr], binOpsMatch, binaryOperatorsPattern);
    return binOpsMatch;
}


/*
 * Returns the token name for a single lexeme.
 *
 * The checks run in a fixed order and the first hit wins:
 * string literal, number, keyword, data type, two-character operator,
 * single-character symbol, library object. Anything left over is an
 * identifier.
 */
static std::string classify_lexeme (const std::string& lexeme) {
    //Patterns and tables are built once, on first use, instead of once per
    //lexeme (regexes) or once per call (tables).
    static const std::regex stringPattern("(\".*\")");
    //One or more digits with an optional fractional part. The previous
    //pattern "(\\d)|(\\d+.\\d+)" accepted only a single digit or
    //"digits, any character, digits", so "42" was reported as an identifier
    //and "1a2" as a number.
    static const std::regex numberPattern("\\d+(\\.\\d+)?");

    //Reserved Key Words
    static const std::vector<std::string> keywords = {
        "string","include","auto","const","struct","unsigned","break",
        "continue","else","for","signed","switch","void","case","default",
        "enum","goto","register","sizeof","typedef","volatile","char","do",
        "extern","if","return","static","union","while","asm","dynamic_cast",
        "namespace","reinterpret_cast","try","bool","explicit","new","template",
        "static_cast","typeid","catch","false","operator","typename","public",
        "class","friend","private","this","using","const_cast","inline","throw",
        "virtual","delete","mutable","protected","true","elseif"};

    //NOTE(review): "string" also appears in the keyword table, which is
    //consulted first, so it is never reported as PRIM_DTYPE.
    static const std::vector<std::string> dataTypes = {
        "double","float","int","short","size_t","long","string"};

    //Two-character operators: {lexeme, token name}
    //NOTE(review): ":" is listed here as SCOPE_RES and this table is
    //consulted before the single-character table, so the ":" -> COND_SEP
    //entry below is never reached. Left unchanged to keep the emitted
    //token names stable.
    static const std::vector<std::pair<std::string,std::string>> binaryOperators = {
        {"+=","ADD_ASSIGN"}, {"-=","SUB_ASSIGN"}, {"*=","MUL_ASSIGN"},
        {"/=","DIV_ASSIGN"}, {"%=","MOD_ASSIGN"}, {"&=","AND_ASSIGN"},
        {"!=","LOGIC_INEQ"}, {"==","LOGIC_EQ"},   {"|=","OR_ASSIGN"},
        {"^=","XOR_ASSIGN"}, {"<=","LESS_OR_EQ"}, {">=","MORE_OR_EQ"},
        {"--","DECREMENT"},  {"++","INCREMENT"},  {"<<","INSERTION"},
        {">>","EXTRACTION"}, {"&&","LOGIC_AND"},  {"||","LOGIC_OR"},
        {"->","MEMBER_PTR"}, {":","SCOPE_RES"}};

    //Single-character symbols: {lexeme, token name}
    static const std::vector<std::pair<std::string,std::string>> unarySymbols = {
        {".","MEMBER_OBJ"},  {"#","PREPROC"},      {",","SEPARATOR"},
        {"=","ASSIGN"},      {"-","SUB"},          {"+","ADD"},
        {"/","DIV"},         {"*","MUL_OR_DEREF"}, {"%","MOD"},
        {"(","L_PAREN"},     {")","R_PAREN"},      {"{","L_BRACE"},
        {"}","R_BRACE"},     {"[","L_BRACKET"},    {"]","R_BRACKET"},
        {"~","COMPLEMENT"},  {"^","XOR"},          {"|","OR"},
        {"&","AND"},         {"?","CONDITIONAL"},  {":","COND_SEP"},
        {";","SEMI_COLON"},  {"!","NOT"},          {">","GREATER_THAN"},
        {"<","LESS_THAN"}};

    //Library Objects
    //NOTE(review): "sizeof" is also a keyword above and is reported as such.
    static const std::vector<std::string> libraryObjects = {
        "cout","cin","printf","size","sizeof","system","getline","endl",
        "to_string"};

    //match string
    if (std::regex_match(lexeme, stringPattern)) {
        return "STRING_LIT";
    }

    //match numbers
    if (std::regex_match(lexeme, numberPattern)) {
        return "NUMERIC";
    }

    //match keywords
    for (const std::string& keyword : keywords) {
        if (lexeme == keyword) return "KEYWORD";
    }

    //match data types
    for (const std::string& dataType : dataTypes) {
        if (lexeme == dataType) return "PRIM_DTYPE";
    }

    //match binary operators
    for (const auto& entry : binaryOperators) {
        if (lexeme == entry.first) return entry.second;
    }

    //match unary operators
    for (const auto& entry : unarySymbols) {
        if (lexeme == entry.first) return entry.second;
    }

    //match library objects
    for (const std::string& object : libraryObjects) {
        if (lexeme == object) return "LIB_OBJ";
    }

    //left overs are identifiers
    return "IDENTIFIER";
}

/*
 * Assigns matching tokens to lexemes.
 *
 * vLex     lexemes in source order; read only.
 * vTknLex  receives one (lexeme, token name) pair per element of vLex,
 *          appended after whatever it already holds.
 */
void tokenizer (std::vector<std::string>& vLex,
                std::vector<std::pair<std::string,std::string>>& vTknLex) {
    vTknLex.reserve(vTknLex.size() + vLex.size());

    for (const std::string& lexeme : vLex) {
        vTknLex.emplace_back(lexeme, classify_lexeme(lexeme));
    }
}