// C++ lexical analyser for JavaScript: lexical.cpp

/* lexical.cpp -- version 0.2, May 1st, 2015

  Copyright (C) 2015 Raphaël Dujardin

  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Raphaël Dujardin
  rdujardin.com

*/

#include "lexical.h"

namespace lex
{

    Token::Token(TokenType _type,std::string _value)
    {
        type=_type;
        value=_value;
    }

    Token::Token(const Token& token)
    {
        (*this)=token;
    }

    Token::~Token()
    {
        //.
    }

    void Token::print(std::ostream& out,bool withValue,bool withEol) const
    {
        switch(type)
        {
        case NULLTOKEN:
            out << "NULLTOKEN";
            if(withEol) out << std::endl;
            break;
        case INTEGER:
            out << "INTEGER";
            if(withValue) out << " # " << value;
            if(withEol) out << std::endl;
            break;
        case FLOATING:
            out << "FLOATING";
            if(withValue) out << " # " << value;
            if(withEol) out << std::endl;
            break;
        case STRING:
            out << "STRING";
            if(withValue) out << " # " << value;
            if(withEol) out << std::endl;
            break;
        case NAME:
            out << "NAME";
            if(withValue) out << " # " << value;
            if(withEol) out << std::endl;
            break;
        case PLUS:
            out << "PLUS";
            if(withEol) out << std::endl;
            break;
        case MINUS:
            out << "MINUS";
            if(withEol) out << std::endl;
            break;
        case MULTIPLY:
            out << "MULTIPLY";
            if(withEol) out << std::endl;
            break;
        case DIVIDE:
            out << "DIVIDE";
            if(withEol) out << std::endl;
            break;
        case MODULO:
            out << "MODULO";
            if(withEol) out << std::endl;
            break;
        case EQUAL:
            out << "EQUAL";
            if(withEol) out << std::endl;
            break;
        case LPARENTHESIS:
            out << "LPARENTHESIS";
            if(withEol) out << std::endl;
            break;
        case RPARENTHESIS:
            out << "RPARENTHESIS";
            if(withEol) out << std::endl;
            break;
        case LBRACKET:
            out << "LBRACKET";
            if(withEol) out << std::endl;
            break;
        case RBRACKET:
            out << "RBRACKET";
            if(withEol) out << std::endl;
            break;
        case LSQUARE:
            out << "LSQUARE";
            if(withEol) out << std::endl;
            break;
        case RSQUARE:
            out << "RSQUARE";
            if(withEol) out << std::endl;
            break;
        case COMMA:
            out << "COMMA";
            if(withEol) out << std::endl;
            break;
        case SEMICOLON:
            out << "SEMICOLON";
            if(withEol) out << std::endl;
            break;
        case INFERIOR:
            out << "INFERIOR";
            if(withEol) out << std::endl;
            break;
        case SUPERIOR:
            out << "SUPERIOR";
            if(withEol) out << std::endl;
            break;
        case BREAK:
            out << "BREAK";
            if(withEol) out << std::endl;
            break;
        case CASE:
            out << "CASE";
            if(withEol) out << std::endl;
            break;
        case CATCH:
            out << "CATCH";
            if(withEol) out << std::endl;
            break;
        case CONST:
            out << "CONST";
            if(withEol) out << std::endl;
            break;
        case CONTINUE:
            out << "CONTINUE";
            if(withEol) out << std::endl;
            break;
        case DEBUGGER:
            out << "DEBUGGER";
            if(withEol) out << std::endl;
            break;
        case DO:
            out << "DO";
            if(withEol) out << std::endl;
            break;
        case ELSE:
            out << "ELSE";
            if(withEol) out << std::endl;
            break;
        case FINALLY:
            out << "FINALLY";
            if(withEol) out << std::endl;
            break;
        case FOR:
            out << "FOR";
            if(withEol) out << std::endl;
            break;
        case FUNCTION:
            out << "FUNCTION";
            if(withEol) out << std::endl;
            break;
        case IF:
            out << "IF";
            if(withEol) out << std::endl;
            break;
        case IN:
            out << "IN";
            if(withEol) out << std::endl;
            break;
        case INSTANCEOF:
            out << "INSTANCEOF";
            if(withEol) out << std::endl;
            break;
        case LET:
            out << "LET";
            if(withEol) out << std::endl;
            break;
        case NEW:
            out << "NEW";
            if(withEol) out << std::endl;
            break;
        case RETURN:
            out << "RETURN";
            if(withEol) out << std::endl;
            break;
        case SWITCH:
            out << "SWITCH";
            if(withEol) out << std::endl;
            break;
        case THIS:
            out << "THIS";
            if(withEol) out << std::endl;
            break;
        case THROW:
            out << "THROW";
            if(withEol) out << std::endl;
            break;
        case TRY:
            out << "TRY";
            if(withEol) out << std::endl;
            break;
        case TYPEOF:
            out << "TYPEOF";
            if(withEol) out << std::endl;
            break;
        case VAR:
            out << "VAR";
            if(withEol) out << std::endl;
            break;
        case VOID:
            out << "VOID";
            if(withEol) out << std::endl;
            break;
        case WHILE:
            out << "WHILE";
            if(withEol) out << std::endl;
            break;
        case DBEQUAL:
            out << "DBEQUAL";
            if(withEol) out << std::endl;
            break;
        case DBPLUS:
            out << "DBPLUS";
            if(withEol) out << std::endl;
            break;
        case DBMINUS:
            out << "DBMINUS";
            if(withEol) out << std::endl;
            break;
        default:
            out << "?ERROR?";
            if(withEol) out << std::endl;
            break;
        }
    }

    void Token::printType(std::ostream& out) const
    {
        print(out,false);
    }

    void Token::printCode(std::ostream& out) const
    {
        out << value;
    }

    Token& Token::operator=(const Token &token)
    {
        type=token.type;
        value=token.value;
        return *this;
    }

    Sequence::Sequence()
    {
        //.
    }

    Sequence::Sequence(const std::string& code)
    {
        lex(code);
    }

    Sequence::Sequence(const Token& token)
    {
        (*this)+=token;
    }

    Sequence::Sequence(const Sequence& sequence)
    {
        (*this)=sequence;
    }

    Sequence::~Sequence()
    {
        //.
    }

    void Sequence::print(std::ostream& out,bool compact) const
    {
        if(!compact) out << std::endl;
        out << "[";
        if(!compact) out << std::endl;
        for(std::deque<Token>::const_iterator it=tokens.begin();it!=tokens.end();it++)
        {
            (*it).print(out,true,!compact);
            out << ((compact)?((it+1==tokens.end())?" ":" | "):"");
        }
        if(!compact) out << std::endl;
        out << "]";
        if(!compact) out << std::endl;
    }

    void Sequence::printCode(std::ostream& out) const
    {
        for(std::deque<Token>::const_iterator it=tokens.begin();it!=tokens.end();it++)
        {
            it->printCode(out);
        }
    }

    Sequence& Sequence::operator=(const Sequence &sequence)
    {
        tokens=sequence.tokens;
        return *this;
    }

    Sequence& Sequence::operator+=(const Sequence &sequence)
    {
        tokens.insert(tokens.end(),sequence.tokens.begin(),sequence.tokens.end());
        return *this;
    }

    Sequence& Sequence::operator+=(const Token &token)
    {
        tokens.insert(tokens.end(),token);
        return *this;
    }

    std::string Sequence::toStr(const std::vector<char> &buf)
    {
        return std::string(buf.begin(),buf.end());
    }

    std::pair<TokenType,std::vector<char>::const_iterator> Sequence::numberBuf(const std::vector<char> &buf)
    {
        bool isInteger=true;
        for(std::vector<char>::const_iterator it=buf.begin();it!=buf.end();it++)
        {
            if(*it<0x30 or *it>0x39)
            {
                if(*it=='.' and isInteger) isInteger=false;
                else return std::pair<TokenType,std::vector<char>::const_iterator>((isInteger)?INTEGER:FLOATING,it);
            }
        }
        return std::pair<TokenType,std::vector<char>::const_iterator>((isInteger)?INTEGER:FLOATING,buf.end());
    }

    Sequence Sequence::analyzeBuf(const std::vector<char> &buf)
    {
        std::pair<TokenType,std::vector<char>::const_iterator> num=numberBuf(buf);
        if(num.second==buf.end()) return Sequence(Token(num.first,toStr(buf)));
        else
        {
            if(num.second!=buf.begin())
            {
                return Sequence(Token(num.first,std::string(buf.begin(),num.second)))+Token(NAME,std::string(num.second,buf.end()));
            }
        }
        std::string s=toStr(buf);
        if(s=="==") return Sequence(Token(DBEQUAL));
        if(s=="break") return Sequence(Token(BREAK));
        if(s=="case") return Sequence(Token(CASE));
        if(s=="catch") return Sequence(Token(CATCH));
        if(s=="const") return Sequence(Token(CONST));
        if(s=="continue") return Sequence(Token(CONTINUE));
        if(s=="debugger") return Sequence(Token(DEBUGGER));
        if(s=="do") return Sequence(Token(DO));
        if(s=="else") return Sequence(Token(ELSE));
        if(s=="finally") return Sequence(Token(FINALLY));
        if(s=="for") return Sequence(Token(FOR));
        if(s=="function") return Sequence(Token(FUNCTION));
        if(s=="if") return Sequence(Token(IF));
        if(s=="in") return Sequence(Token(IN));
        if(s=="instanceof") return Sequence(Token(INSTANCEOF));
        if(s=="let") return Sequence(Token(LET));
        if(s=="new") return Sequence(Token(NEW));
        if(s=="return") return Sequence(Token(RETURN));
        if(s=="switch") return Sequence(Token(SWITCH));
        if(s=="this") return Sequence(Token(THIS));
        if(s=="throw") return Sequence(Token(THROW));
        if(s=="try") return Sequence(Token(TRY));
        if(s=="typeof") return Sequence(Token(TYPEOF));
        if(s=="var") return Sequence(Token(VAR));
        if(s=="void") return Sequence(Token(VOID));
        if(s=="while") return Sequence(Token(WHILE));
        return Sequence(Token(NAME,s));
    }

    void Sequence::lex(const std::string& code)
    {
        std::vector<char> buffer;
        char temp=0;
        bool multiCommentary=false, lineCommentary=false, quoteString=false, dbQuoteString=false;

        for(std::string::const_iterator it=code.begin();it!=code.end();it++)
        {
            if(multiCommentary)
            {
                if(*it=='/' and temp=='*') { temp=0; multiCommentary=false; }
                if(*it=='*') temp=*it;
                else temp=0;
            }
            else if(lineCommentary)
            {
                if(*it=='\n') { lineCommentary=false; }
            }
            else if(quoteString)
            {
                if(*it=='\'' and temp!='\\')
                {
                    temp=0; quoteString=false;
                    (*this)+=Token(STRING,toStr(buffer));
                    buffer.clear();
                }
                else
                {
                    if(*it=='\\') { temp=*it; }
                    else { temp=0; }
                    buffer.push_back(*it);
                }
            }
            else if(dbQuoteString)
            {
                if(*it=='"' and temp!='\\')
                {
                    temp=0; dbQuoteString=false;
                    (*this)+=Token(STRING,toStr(buffer));
                    buffer.clear();
                }
                else
                {
                    if(*it=='\\') { temp=*it; }
                    else { temp=0; buffer.push_back(*it); }
                }
            }
            else
            {
                if(*it=='<' or *it=='>' or *it==' ' or *it=='\n' or *it=='\r' or *it=='\t' or *it=='+' or *it=='-' or *it=='*' or *it=='/' or *it=='%' or *it=='=' or *it=='(' or *it==')' or *it=='{' or *it=='}' or *it=='[' or *it==']' or *it==',' or *it==';' or *it=='\'' or *it=='"')
                {
                    if(!buffer.empty())
                    {
                        (*this)+=analyzeBuf(buffer);
                        buffer.clear();
                    }

                    Token token;
                    bool add=true;
                    if(*it=='<') token.type=INFERIOR;
                    if(*it=='>') token.type=SUPERIOR;
                    if(*it=='+') { if(tokens.back().type==PLUS) { tokens[tokens.size()-1].type=DBPLUS; add=false; } else token.type=PLUS;}
                    if(*it=='-') { if(tokens.back().type==MINUS) { tokens[tokens.size()-1].type=DBMINUS; add=false; } else token.type=MINUS;}
                    if(*it=='*') { if(tokens.back().type==DIVIDE) { tokens.pop_back(); multiCommentary=true; add=false; } else token.type=MULTIPLY; }
                    if(*it=='/') { if(tokens.back().type==DIVIDE) { tokens.pop_back(); lineCommentary=true; add=false; } else token.type=DIVIDE; }
                    if(*it=='%') token.type=MODULO;
                    if(*it=='\'') { quoteString=true; add=false; }
                    if(*it=='"') { dbQuoteString=true; add=false; }
                    if(*it==',') token.type=COMMA;
                    if(*it==';') token.type=SEMICOLON;
                    if(*it=='(') token.type=LPARENTHESIS;
                    if(*it==')') token.type=RPARENTHESIS;
                    if(*it=='{') token.type=LBRACKET;
                    if(*it=='}') token.type=RBRACKET;
                    if(*it=='[') token.type=LSQUARE;
                    if(*it==']') token.type=RSQUARE;
                    if(*it=='=') { if(tokens.back().type==EQUAL) { tokens[tokens.size()-1].type=DBEQUAL; add=false; } else token.type=EQUAL;}

                    if(*it!=' ' && *it!='\n' && *it!='\r' && *it!='\t' && add) { (*this)+=token; }
                }
                else buffer.push_back(*it);
            }
        }
        if(!buffer.empty())
        {
            (*this)+=analyzeBuf(buffer);
        }
    }

}

// Streams the name of a token type by printing a temporary token of that type.
std::ostream& operator<<(std::ostream& stream,const lex::TokenType &tokenType)
{
    const lex::Token carrier(tokenType);
    carrier.printType(stream);
    return stream;
}

// Streams a token through Token::print(), using that function's default flags.
std::ostream& operator<<(std::ostream& stream,const lex::Token& token)
{
    const lex::Token& source=token;
    source.print(stream);
    return stream;
}

// Streams a sequence through Sequence::print(), using that function's default flags.
std::ostream& operator<<(std::ostream& stream,const lex::Sequence& sequence)
{
    const lex::Sequence& source=sequence;
    source.print(stream);
    return stream;
}

// Concatenation: returns a new sequence holding the tokens of seq1 followed by those of seq2.
lex::Sequence operator+(const lex::Sequence &seq1,const lex::Sequence &seq2)
{
    return lex::Sequence(seq1)+=seq2;
}

// Returns a new sequence holding the tokens of seq followed by token.
lex::Sequence operator+(const lex::Sequence &seq,const lex::Token &token)
{
    return lex::Sequence(seq)+=token;
}

// Returns a new two-token sequence: token1 followed by token2.
lex::Sequence operator+(const lex::Token &token1,const lex::Token &token2)
{
    lex::Sequence joined;
    joined+=token1;
    joined+=token2;
    return joined;
}