/* CSS Tokeniser */

/** Parses a block of CSS code and returns it as a serialised object. Supports even unorthodox use such as nested rulesets and "junk" declarations. Comments are left intact. */
/**
 * Compact build of the CSS tokeniser.
 *
 * Every entry of the returned array (and, recursively, of every block) is one of:
 *  - a block: an Array carrying `type` (32) and `name` (the trimmed selector) properties;
 *  - a declaration: `{name, value}`, only ever stored inside a block;
 *  - a comment: `{type: 1, data}`, plus `textBefore` when the comment interrupted pending text;
 *  - a junk string: a colon-less statement inside a block, or any top-level statement (`@import ...;`);
 *  - `{data}`: top-level text that was still dangling when the input ended.
 *
 * @param {String} e - CSS source. `null`/`undefined` yields an empty array.
 * @return {Array} Serialised token list
 */
function tokenise(e){
    if(undefined === e || null === e) return [];

    var t       =   [],                                         /* Returned token list */
        src     =   undefined === "a"[0] ? e.split("") : e,     /* IE7 can't index strings directly */

        /* Delimiter token types (2 = square, 4 = round, 8 = double quote, 16 = single quote) and their characters */
        OPEN    =   {2: "[", 4: "(", 8: '"', 16: "'"},
        SHUT    =   {2: "]", 4: ")", 8: '"', 16: "'"},
        KIND    =   {"[": 2, "(": 4, '"': 8, "'": 16},

        /* A complete quoted string, by quote type. No "g" flag: these objects are reused, so .test() must stay stateless. */
        QUOTED  =   {8: /"(?:[^\\"]|\\.)*"/, 16: /'(?:[^\\']|\\.)*'/},

        TRIM    =   /^\s+|\s+$/g,
        LTRIM   =   /^\s+/,
        DECL    =   /\s*([^:]+):\s*([^\x00]+)\s*/m,

        r,              /* Current character */
        i       =   0,  /* Index of the NEXT character to read */
        s,              /* Token currently being collected (falsy at top-level) */
        a       =   t,  /* Innermost open block (or the result list): where comments get filed */
        o, u;           /* Scratch variables */


    /* Files the current comment under `a`, then resumes whatever the comment interrupted. */
    function endComment(){
        u = s;
        a.push(u);
        s = u.parent || null;
        delete u.parent;
    }

    /* Starts a comment inside the current token, keeping a copy of the text collected ahead of it. */
    function startComment(){
        o = ((s.textBefore || "") + s.data).replace(LTRIM, "");
        s = {type: 1, data: "", parent: s};
        if(o) s.textBefore = o;
    }

    /* Folds the current bracket/quote token back into its parent's text; `closed` is false for delimiters left open at EOT. */
    function endDelimiter(closed){
        u = s;
        s = u.parent;
        s.data += OPEN[u.type] + u.data + (closed ? SHUT[u.type] : "");
    }

    /* Stores whatever text the current block still holds: a declaration if it has a colon, junk otherwise. */
    function flush(){
        if(!s.data) return;
        if(o = s.data.match(DECL)) s.push({name: o[1], value: o[2].replace(TRIM, "")});
        else if(o = s.data.replace(TRIM, "")) s.push(o);
    }


    for(;;){
        r = src[i];
        ++i;

        /* End of input: close anything that's been left open. */
        if(undefined === r){
            if(s){
                if(1 === s.type) endComment();

                /* Unwind brackets/quotes AFTER the comment, so text around an unclosed comment isn't lost. */
                while(s && (30 & s.type)) endDelimiter(false);

                if(s){
                    if(32 === s.type){
                        flush();

                        /* Walk out to the outermost block, dropping the bookkeeping properties on the way. */
                        while(s.parent){
                            u = s;
                            s = u.parent;
                            delete u.parent;
                            delete u.data;
                        }
                        delete s.parent;
                        delete s.data;
                    }
                    t.push(s);
                }
            }
            break;
        }


        /* Top-level, nothing collected yet. */
        if(!s){
            if(" " === r || "\t" === r || "\n" === r || "\r" === r || "\f" === r) continue;

            if("/" === r && "*" === src[i]){
                ++i;
                s = {type: 1, data: ""};
            }
            else if("{" === r){
                a = s = [];
                s.type = 32;
                s.data = s.name = "";
            }
            else s = {data: r};
        }

        /* Comment */
        else if(1 === s.type){
            if("*" === r && "/" === src[i]){
                ++i;
                endComment();
            }
            else s.data += r;
        }

        /* [Square] or (round) brackets */
        else if(6 & s.type){
            if(SHUT[s.type] === r)              endDelimiter(true);
            else if('"' === r || "'" === r)     s = {type: KIND[r], parent: s, data: ""};
            else if("[" === r)                  s = {type: 2, parent: s, data: ""};
            else if("/" === r && "*" === src[i]){
                ++i;
                startComment();
            }
            else s.data += r;
        }

        /* "Double" or 'single' quotes */
        else if(24 & s.type){

            /* A quote preceded by a backslash only ends the string if that backslash was itself escaped. */
            if(SHUT[s.type] === r && !("\\" === src[i - 2] && !QUOTED[s.type].test(r + s.data + r)))
                endDelimiter(true);
            else s.data += r;
        }

        /* Block contents, or plain top-level text */
        else{
            u = 32 === s.type;

            if(o = KIND[r])
                s = {type: o, parent: s, textBefore: s.data + r, data: ""};

            else if("{" === r){
                o       = [];
                o.type  = 32;
                o.data  = "";
                o.name  = s.data.replace(TRIM, "");

                /* Only blocks can own nested blocks; top-level text merely supplied the name. */
                if(u){
                    s.data = "";
                    s.push(o);
                    o.parent = s;
                }
                a = s = o;
            }

            else if(";" === r){
                if(u){
                    if(o = s.data.match(DECL)) s.push({name: o[1], value: o[2].replace(TRIM, "")});
                    else s.push(s.data.replace(LTRIM, "") + ";");
                    s.data = "";
                }

                /* Top-level statement (@import, @charset...): keep it verbatim, even if it holds a colon. */
                else{
                    a.push(s.data.replace(LTRIM, "") + ";");
                    s = null;
                }
            }

            /* A "}" outside of any block is just text. */
            else if("}" === r && u){
                flush();
                u = s;
                o = u.parent;
                delete u.parent;
                delete u.data;

                if(o) a = s = o;
                else{
                    t.push(u);
                    s = null;
                    a = t;
                }
            }

            else if("/" === r && "*" === src[i]){
                ++i;
                startComment();
            }

            else s.data += r;
        }
    }

    return t;
}


/** Uncompressed version of above function so y'all can see what the hell's going on. */
/**
 * Parses a block of CSS code and returns it as a serialised array of tokens.
 *
 * Every entry of the returned array (and, recursively, of every block) is one of:
 *  - a block: an Array carrying `type` (32) and `name` (the trimmed selector) properties, holding
 *    its declarations, nested blocks, comments and junk strings in source order;
 *  - a declaration: `{name, value}`, only ever stored inside a block;
 *  - a comment: `{type: 1, data}`, plus `textBefore` when the comment interrupted pending text;
 *  - a junk string: a colon-less statement inside a block (kept with its ";" when one terminated it),
 *    or any top-level statement such as `@import ...;`;
 *  - `{data}`: top-level text that was still dangling when the input ended.
 *
 * @param {String} string - CSS source. `null`/`undefined` yields an empty array.
 * @return {Array} Serialised token list
 */
function tokenise(string){

    /** Nothing to read means nothing to return. */
    if(undefined === string || null === string) return [];


    /** Returned CSS array. */
    var CSS     =   [],

    /** Run a quick-and-dirty hack for browsers that don't support direct character access in strings (thanks, IE7) */
        source  =   (undefined === "a"[0] ? string.split("") : string),

    /** Token type constants */
        T_COMMENT           =   1,
        T_BRACKETS_SQUARE   =   2,
        T_BRACKETS_ROUND    =   4,
        T_QUOTES_DOUBLE     =   8,
        T_QUOTES_SINGLE     =   16,
        T_BLOCK             =   32,

    /** Bitmasks matching either bracket type, either quote type, or any of those four. */
        T_BRACKETS          =   6,
        T_QUOTES            =   24,
        T_DELIMITER         =   30,


    /** Lookup tables tying each delimiter type to its opening/closing character, and back again. */
        OPENERS     =   {2: "[", 4: "(", 8: '"', 16: "'"},
        CLOSERS     =   {2: "]", 4: ")", 8: '"', 16: "'"},
        TYPE_OF     =   {"[": 2, "(": 4, '"': 8, "'": 16},


    /**
     * Matches one complete quoted string, keyed by quote type. Deliberately NOT global:
     * these objects are reused for every quote we meet, so .test() has to stay stateless.
     */
        rQuoted     =   {
            8:      /"(?:[^\\"]|\\.)*"/,
            16:     /'(?:[^\\']|\\.)*'/
        },

    /** RegExps for stripping whitespace, and for breaking "name: value" apart. */
        rTrim           =   /^\s+|\s+$/g,
        rTrimLeft       =   /^\s+/,
        rDeclaration    =   /\s*([^:]+):\s*([^\x00]+)\s*/m,


    /** Iterator variables. Note that index always points at the character AFTER chr. */
        chr, index  =   0, token,

    /** Junk variables: used for juggling data within the loop. */
        match, block, outer, isBlock,


    /**
     *  Pointer to the last block that was opened in the token stack. Used for dumping injected comments that were found between parsable tokens.
     *  Note that injected comments retain a copy of the leading character data (minus leading whitespace) so developers can use any injected
     *  commentary to supply "custom properties" or metadata to their scripts.
     */
        dumpTo  =   CSS;


    /** Stores the current comment token, then hands focus back to whatever the comment interrupted (if anything). */
    function closeComment(){
        var comment =   token;
        dumpTo.push(comment);
        token       =   comment.parent || null;
        delete comment.parent;
    }


    /** Starts a comment inside the current token, remembering the text that was collected ahead of it. */
    function openComment(){
        var lead    =   ((token.textBefore || "") + token.data).replace(rTrimLeft, "");
        token       =   {
            type:   T_COMMENT,
            data:   "",
            parent: token
        };
        if(lead) token.textBefore = lead;
    }


    /**
     * Folds the current bracket/quote token back into its parent's text and moves focus to that parent.
     * Pass false for a delimiter that never got closed, so no closing character is invented for it.
     */
    function closeDelimiter(closed){
        var inner   =   token;
        token       =   inner.parent;
        token.data  +=  OPENERS[inner.type] + inner.data + (closed ? CLOSERS[inner.type] : "");
    }


    /** Stores whatever text the current block is still carrying: a declaration if it holds a colon, junk otherwise. */
    function flushBlock(){
        var text    =   token.data, parts;
        if(!text) return;

        if(parts = text.match(rDeclaration))
            token.push({
                name:   parts[1],
                value:  parts[2].replace(rTrim, "")
            });

        /** Junk (that isn't whitespace) */
        else if(text = text.replace(rTrim, ""))
            token.push(text);
    }


    for(;;){
        chr =   source[index];
        ++index;


        /** EOT? Wrap up whatever was left hanging open, then bail. */
        if(undefined === chr){

            if(token){

                /** Unclosed comment */
                if(T_COMMENT === token.type) closeComment();

                /**
                 * Unclosed brackets/quotes: hand their text back to the token that owns them.
                 * This runs after the comment check so text surrounding an unclosed comment isn't thrown away.
                 */
                while(token && (T_DELIMITER & token.type)) closeDelimiter(false);

                /** Make sure we're not operating on a token that's been emptied from top-level (e.g., unclosed comment block at top-level) */
                if(token){

                    /** Unclosed block */
                    if(T_BLOCK === token.type){
                        flushBlock();

                        /** Climb out to the outermost block, dropping the bookkeeping properties as we go. */
                        while(token.parent){
                            block   =   token;
                            token   =   block.parent;
                            delete block.parent;
                            delete block.data;
                        }
                        delete token.parent;
                        delete token.data;
                    }

                    CSS.push(token);
                }
            }
            break;
        }


        /** No token currently being carried yet, which means we're cruising at top-level. */
        if(!token){

            /** Whitespace? Ignore. */
            if(" " === chr || "\t" === chr || "\n" === chr || "\r" === chr || "\f" === chr) continue;

            /** Comment */
            if("/" === chr && "*" === source[index]){
                ++index;
                token   =   {
                    type:   T_COMMENT,
                    data:   ""
                };
            }

            /** Selector-less block: somebody's put { ... } in the middle of nowhere. */
            else if("{" === chr){
                dumpTo  =
                token   =   [];
                token.type      =   T_BLOCK;
                token.data      =
                token.name      =   "";
            }

            /** Something else? */
            else token  =   {data: chr};
        }


        /** Comment */
        else if(T_COMMENT === token.type){

            if("*" === chr && "/" === source[index]){
                ++index;

                /** Because comments are free to be inserted virtually anywhere in CSS, they're appended
                 to dumpTo rather than their parent (which may be a string-only token like brackets). */
                closeComment();
            }

            else token.data +=  chr;
        }


        /** [Square] or (round) brackets */
        else if(T_BRACKETS & token.type){

            if(CLOSERS[token.type] === chr) closeDelimiter(true);

            /** Watch out for quotes. */
            else if('"' === chr || "'" === chr)
                token   =   {type: TYPE_OF[chr], parent: token, data: ""};

            /** Look out for nested square brackets, too. */
            else if("[" === chr)
                token   =   {type: T_BRACKETS_SQUARE, parent: token, data: ""};

            /** Start of an injected comment */
            else if("/" === chr && "*" === source[index]){
                ++index;
                openComment();
            }

            else token.data +=  chr;
        }


        /** "Double" or 'single' quotes */
        else if(T_QUOTES & token.type){

            /**
             * End of quote, unless the quote character's escaped. A preceding backslash only escapes it if that
             * backslash wasn't escaped itself, which is what testing the would-be complete string tells us.
             */
            if(CLOSERS[token.type] === chr && !("\\" === source[index - 2] && !rQuoted[token.type].test(chr + token.data + chr)))
                closeDelimiter(true);

            else token.data +=  chr;
        }


        /** Either the contents of a block, or loose text at top-level. */
        else{
            isBlock =   T_BLOCK === token.type;

            /** Brackets or quotes */
            if(match = TYPE_OF[chr]){
                token   =   {
                    type:       match,
                    parent:     token,
                    textBefore: token.data + chr,
                    data:       ""
                };
            }


            /** Block: whatever text we were holding becomes its name. */
            else if("{" === chr){
                block       =   [];
                block.type  =   T_BLOCK;
                block.data  =   "";
                block.name  =   token.data.replace(rTrim, "");

                /** Only a block can own a nested block. Loose top-level text has done its job by supplying the name. */
                if(isBlock){
                    token.data      =   "";
                    token.push(block);
                    block.parent    =   token;
                }

                /** Switch focus to our newly-created block token. */
                dumpTo  =
                token   =   block;
            }


            /** Semicolon: end of a statement. */
            else if(";" === chr){

                if(isBlock){

                    /** Break the collected text apart by the first colon, and assign it as a new property value. */
                    if(match = token.data.match(rDeclaration))
                        token.push({
                            name:   match[1],
                            value:  match[2].replace(rTrim, "")
                        });

                    /** If no colon was found, then this is a meaningless declaration. Store it anyway as junk. */
                    else token.push(token.data.replace(rTrimLeft, "") + ";");

                    token.data  =   "";
                }

                /**
                 * Top-level statement (@import, @charset...). It's kept verbatim even if it holds a colon: plain text
                 * tokens can't store declarations, and something like a URL shouldn't be split at its colon anyway.
                 */
                else{
                    dumpTo.push(token.data.replace(rTrimLeft, "") + ";");
                    token   =   null;
                }
            }


            /** End of block. A "}" that isn't closing anything is treated as ordinary text further down. */
            else if("}" === chr && isBlock){

                /** The last property declaration in a block may omit its trailing semicolon. */
                flushBlock();

                block   =   token;
                outer   =   block.parent;

                /** Free up some memory by losing some properties we no longer need. */
                delete block.parent;
                delete block.data;

                /** The block was nested inside another one, which already holds a reference to it. */
                if(outer){
                    dumpTo  =
                    token   =   outer;
                }

                /** Block's at top-level, so push it onto the end of our returned CSS array. */
                else{
                    CSS.push(block);
                    token   =   null;
                    dumpTo  =   CSS;
                }
            }


            /** Nested comment */
            else if("/" === chr && "*" === source[index]){
                ++index;
                openComment();
            }


            /** Still nothing special. */
            else token.data +=  chr;
        }
    }

    return CSS;
}