Advertisement
axyd

cpp_tokenizer_mine

May 22nd, 2022
1,073
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 6.77 KB | None | 0 0
  1. #include <iostream>
  2. #include <fstream>
  3. #include <regex>
  4. #include <vector>
  5. #include <string>
  6. #include <iomanip>
  7. using namespace std;
  8.  
  9.  
  10. /*Detects and merges binary operators*/
  11. smatch bin_ops_merger (vector<string>&, size_t);
  12.  
  13. /*Combines two vectors element-wise into a vector of pairs*/
  14. template<typename T,typename U>
  15. vector<pair<T,U>> merge_vectors (const vector<T>&, const vector<U>&);
  16.  
  17. /*Assigns Matching Tokens to Lexemes*/
  18. void tokenizer (vector<string>&, vector<pair<string,string>>&);
  19.  
  20.  
  21. int main(int argc, char** argv) {
  22.     //store full source file in a string
  23.     string file, line;
  24.     ifstream ifs("analyzethis.file");
  25.    
  26.     while (getline(ifs, line, '\0')){
  27.         file+= line;
  28.     }
  29.     ifs.close();
  30.    
  31.     /*Strip single and multi line comments*/
  32.     regex rexComments("(//.*)|(/\\*(?:.|[\\n\\r])*?\\*/)");
  33.  
  34.     string result;
  35.     regex_replace(std::back_inserter(result), file.begin(), file.end(), rexComments, " ");
  36.     file= result;
  37.    
  38.    
  39.     /* REGEX Patterns:
  40.      *      Not alphanumeric [\\W]
  41.      *      number: (\\d+).(\\d+)
  42.      *      string: (\".*\")
  43.      */
  44.  
  45.     regex rexPtrn("[\\W]|(\\d+).(\\d+)|(\".*\")");
  46.    
  47.    
  48.     //only alpha numeric, reverses negation
  49.     regex_token_iterator<string::iterator> rtiNS(file.begin(), file.end(), rexPtrn, -1);
  50.     //only symbols
  51.     regex_token_iterator<string::iterator> rtiS(file.begin(), file.end(), rexPtrn);
  52.     //end of line comparison
  53.     regex_token_iterator<string::iterator> rtiEnd;
  54.  
  55.    
  56.     vector<string> vData;   //holds token
  57.     while ((rtiNS!=rtiEnd)&&(rtiS!=rtiEnd)){       
  58.         if((*rtiNS).length()>0)                             //if not symbol
  59.             vData.push_back(*rtiNS);
  60.  
  61.         if((*rtiS).length()>0&&*rtiS!=" "&&*rtiS!="\t"&&*rtiS!="\n")
  62.             vData.push_back(*rtiS);
  63.  
  64.         //advance iterators
  65.         ++rtiNS;
  66.         ++rtiS;
  67.     }
  68.  
  69.    
  70.     //FIND BINARY OPERATORS AND COMBINE THEM
  71.     for (size_t ctr= 0; ctr < vData.size() - 1; ++ctr) {
  72.         bin_ops_merger(vData, ctr);
  73.     }
  74.  
  75.    
  76.     /*Holds final matches*/
  77.     vector<pair<string,string>> vTokenLexeme;
  78.     tokenizer(vData, vTokenLexeme);
  79.  
  80.  
  81.     //Save tokenized lexemes
  82.     ofstream ofs("tokenized.txt");
  83.    
  84.     ofs<<string(30, '=')<<endl;
  85.     ofs<<setw(17)<<right<<"Lexeme      ||"<<setw(10)<<"Token"<<endl;
  86.     ofs<<string(30, '=')<<endl;
  87.    
  88.     for(auto it : vTokenLexeme){
  89.         ofs<<setw(15)<<left<<it.first<<"||  "<<it.second<<endl;    
  90.     }
  91.    
  92.     ofs.close();
  93.    
  94.  
  95. //  system("PAUSE");
  96.     return 0;  
  97. }
  98.  
  99.  
  100. /*Combines two vectors into a pair*/
  101. template<typename T,typename U>
  102. vector<pair<T,U>> merge_vectors (const vector<T>& v1, const vector<U>& v2) {
  103.     vector<pair<string, string>> vOut;
  104.    
  105.     for(size_t i= 0; i< v1.size(); ++i){
  106.         vOut.emplace_back(v1.at(i), v2.at(i));
  107.     }  
  108.     return vOut;
  109. }
  110.  
  111. /*Detects and merges binary operators*/
  112. smatch bin_ops_merger (vector<string>& vData, size_t ctr) {
  113.     regex binaryOperatorsPattern("\\+=|-=|\\*=|/=|%=|&=|\\!=|==|\\|=|\\^="
  114.         "|<=|>=|--|\\+\\+|<<|>>|&&|\\|\\||->");
  115.    
  116.     vector <string>::iterator curr, next;
  117.     curr= vData.begin()+ctr;
  118.     next= vData.begin()+ctr+1;
  119.    
  120.     string str= *curr+*next;    //run regex pattern on this string 
  121.     smatch binOpsMatch;         //stores the matched partion
  122.    
  123.     if (regex_match(str, binOpsMatch, binaryOperatorsPattern)) {
  124.         *curr= *curr + *next;                   //merge operators
  125.         vData.erase(next);                      //delete extra element
  126.     }
  127.    
  128.     return binOpsMatch;
  129. }
  130.  
  131.  
  132. /*Assigns Matching Tokens to Lexemes*/
  133. void tokenizer (vector<string>& vLex, vector<pair<string,string>>& vTknLex) {
  134.     //Reserved Key Words
  135.     vector<string> vKword;
  136.     vKword={"string","include","auto","const","struct","unsigned","break",
  137.         "continue","else","for","signed","switch","void","case","default",
  138.         "enum","goto","register","sizeof","typedef","volatile","char","do",
  139.         "extern","if","return","static","union","while","asm","dynamic_cast",
  140.         "namespace","reinterpret_cast","try","bool","explicit","new","template",
  141.         "static_cast","typeid","catch","false","operator","typename","public",
  142.         "class","friend","private","this","using","const_cast","inline","throw",
  143.         "virtual","delete","mutable","protected","true","elseif"};
  144.  
  145.     vector<string> vDataTypes;
  146.     vDataTypes={"double","float","int","short","size_t","long","string"};
  147.    
  148.  
  149.     //Binary Operators
  150.     vector<string> vbotkn, vbolex;
  151.     vbotkn={"+=", "-=", "*=", "/=", "%=", "&=", "!=", "==", "|=", "^=", "<=",
  152.         ">=","--", "++", "<<", ">>", "&&", "||", "->",":"};
  153.     vbolex={"ADD_ASSIGN","SUB_ASSIGN","MUL_ASSIGN","DIV_ASSIGN","MOD_ASSIGN",
  154.         "AND_ASSIGN","LOGIC_INEQ","LOGIC_EQ","OR_ASSIGN","XOR_ASSIGN",
  155.         "LESS_OR_EQ","MORE_OR_EQ","DECREMENT","INCREMENT","INSERTION",
  156.         "EXTRACTION","LOGIC_AND","LOGIC_OR","MEMBER_PTR","SCOPE_RES"};
  157.     vector<pair<string,string>> vBOpsTokens= merge_vectors(vbotkn, vbolex);
  158.    
  159.     //Unary Symbols
  160.     vector<string> vsymtkn, vsymlex;
  161.     vsymtkn={".","#",",","=","-","+","/","*","%","(",")","{","}","[","]","~",
  162.         "^","|","&","?",":",";","!",">","<"};
  163.     vsymlex={"MEMBER_OBJ","PREPROC","SEPARATOR","ASSIGN","SUB","ADD","DIV",
  164.         "MUL_OR_DEREF","MOD","L_PAREN","R_PAREN","L_BRACE","R_BRACE","L_BRACKET",
  165.         "R_BRACKET","COMPLEMENT","XOR","OR","AND","CONDITIONAL","COND_SEP",
  166.         "SEMI_COLON","NOT","GREATER_THAN","LESS_THAN"};
  167.     vector<pair<string,string>> vUnaryTokens= merge_vectors(vsymtkn, vsymlex);
  168.    
  169.     //Library Objects
  170.     vector<string> vLibObj;
  171.     vLibObj={"cout","cin","printf","size","sizeof","system","getline","endl",
  172.         "to_string"};
  173.    
  174.     /*
  175.      * Search lexemes for token matches
  176.      *
  177.      */
  178.     for(size_t lexItr= 0; lexItr<vLex.size(); ++lexItr){
  179.         bool found= false;
  180.        
  181.         //match string
  182.         regex rexStr("(\".*\")");
  183.         if(regex_match(vLex[lexItr], rexStr)){
  184.             vTknLex.emplace_back(vLex.at(lexItr), "STRING_LIT");
  185.             found= true;
  186.         }
  187.         if(found) continue;
  188.        
  189.         //match numbers
  190.         regex rexNum("(\\d)|(\\d+.\\d+)");
  191.         if(regex_match(vLex[lexItr], rexNum)){
  192.             vTknLex.emplace_back(vLex.at(lexItr), "NUMERIC");
  193.             found= true;
  194.         }
  195.         if(found) continue;
  196.        
  197.         //match keywords
  198.         for(auto it : vKword){
  199.             if(vLex[lexItr]==it){
  200.                 vTknLex.emplace_back(vLex.at(lexItr), "KEYWORD");
  201.                 found= true;
  202.                 break;
  203.             }
  204.         }
  205.         if(found) continue;
  206.        
  207.         //match data types
  208.         for(auto it : vDataTypes){
  209.             if(vLex[lexItr]==it){
  210.                 vTknLex.emplace_back(it, "PRIM_DTYPE");
  211.                 found= true;
  212.                 break;
  213.             }
  214.         }
  215.         if(found) continue;
  216.        
  217.        
  218.         //match binary operators
  219.         for(auto it : vBOpsTokens){
  220.             if(vLex[lexItr]==it.first){
  221.                 vTknLex.emplace_back(it.first, it.second);
  222.                 found= true;
  223.                 break;
  224.             }
  225.         }
  226.         if(found) continue;
  227.        
  228.         //match unary operators
  229.         for(auto it : vUnaryTokens){
  230.             if(vLex[lexItr]==it.first){
  231.                 vTknLex.emplace_back(it.first, it.second);
  232.                 found= true;
  233.                 break;
  234.             }
  235.         }
  236.         if(found) continue;
  237.        
  238.         //match library objects
  239.         for(auto it : vLibObj){
  240.             if(vLex[lexItr]==it){
  241.                 vTknLex.emplace_back(it, "LIB_OBJ");
  242.                 found= true;
  243.                 break;
  244.             }
  245.         }
  246.         if(found) continue;
  247.        
  248.  
  249.        
  250.         //left overs are identifiers
  251.         vTknLex.emplace_back(vLex.at(lexItr), "IDENTIFIER");
  252.     }
  253. }
  254.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement