// Shawn Presser

//====================================================================
// File:        XLex.h
// Author:      Shawn Presser
// Contact:     shawnpresser@gmail.com
// Date:        11-13-08
//
// Purpose:     To separate an ASCII .x file into a set of tokens.
//
// Copyright (c) 2008 Shawn Presser
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//====================================================================
#pragma once

// std c++ headers.
#include <list>

// forward declarations.
// XToken is only referred to through pointers in this header, so an
// incomplete type is sufficient here.
struct XToken;
// NOTE(review): declaring an unscoped enum without a fixed underlying type
// is not standard C++ (some compilers accept it as an extension) -- confirm
// against the enum's definition, presumably in XToken.h.
enum ETokenType;

//====================================================================
// XLex
//====================================================================
class XLex
{
public:
    // lexes a .X file that resides in memory.
    XLex( const char* file, size_t fileSize );
    ~XLex();

    // returns the list of X file tokens.
    XToken*             GetTokens() const       {   return _head;       }

private:
    typedef std::list< XToken* >        TokenContainer;

    // generates one token.
    bool                Tokenize();

    // constructs a new token and finalizes any previous token.
    bool                AddToken( ETokenType type, unsigned int length, unsigned int advance );

    // allocates a new token.
    XToken*             AllocateToken();

    // searches for a character and returns its position (relative to
    // '_at').  If the end of the file is reached, the position of the
    // last character in the file is returned.
    unsigned int        Find( char c, const char* start ) const;

    // returns true if the pointer points to a float (rather than an int).
    bool                IsFloat( const char* pos ) const;

    // store a list of token arrays to be freed on destruction.
    TokenContainer      _tokenPools;

    // the current token pool from which to pull new tokens.
    XToken*             _tokenPool;
    unsigned int        _tokenPoolIdx;

    // the result token list.
    XToken*             _head;
    XToken*             _last;

    // the file data to lex.
    const char*         _fileStart;
    const char*         _fileEnd;
    const char*         _at;

    // the current line number.
    unsigned int        _lineNumber;

    // the number of result tokens.
    unsigned int        _tokenCount;
};

//====================================================================
// File:        XLex.cpp
// Author:      Shawn Presser
// Contact:     shawnpresser@gmail.com
// Date:        11-13-08
//
// Copyright (c) 2008 Shawn Presser
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//====================================================================

// module header.
#include "XLex.h"

// std c headers.
#include <ctype.h> // isdigit
#include <assert.h> // assert

// project headers.
#include "XToken.h"

// constants.
// number of XToken objects allocated per token pool; once a pool has been
// used up, XLex::AllocateToken() allocates another pool of this size.
#define     TOKEN_POOL_SIZE         20480

//====================================================================
// XLex
//====================================================================

//--------------------------------------------------------------------
XLex::XLex( const char* file, size_t fileSize )
: _tokenPool( 0 )
, _tokenPoolIdx( 0 )
, _head( 0 )
, _last( 0 )
, _fileStart( file )
, _fileEnd( file + fileSize )
, _at( file )
, _lineNumber( 0 )
, _tokenCount( 0 )
{
    // every token is carved out of a pool, so create the first pool up
    // front and register it so the destructor can free it.
    _tokenPool = new XToken[ TOKEN_POOL_SIZE ];
    _tokenPools.push_back( _tokenPool );

    // an empty file produces no tokens at all.
    if ( fileSize == 0 )
    {
        return;
    }

    // keep lexing until Tokenize() reports that the end of the file has
    // been reached.
    for ( ;; )
    {
        if ( !Tokenize() )
        {
            break;
        }
    }
}

//--------------------------------------------------------------------
XLex::~XLex()
{
    // free the pools front-to-back, dropping each list entry as soon as the
    // pool it refers to has been released.
    while ( !_tokenPools.empty() )
    {
        delete [] _tokenPools.front();
        _tokenPools.pop_front();
    }
}

//--------------------------------------------------------------------
bool
XLex::Tokenize()
{
    // if the current character is whitespace, just skip it.
    while ( isspace( *_at ) )
    {
        // if the current character is a newline, increment our
        // line number counter.
        if ( *_at == '\n' )
        {
            ++_lineNumber;
        }

        ++_at;

        // if we've reached the end of the file, stop.
        if ( _at >= _fileEnd )
        {
            return false;
        }
    }

    // if the current character is a negation sign (-), get a pointer to the
    // next character.
    const char* num = _at;
    if ( *num == '-' )
    {
        ++num;

        // if we've reached the end of the file, stop.
        if ( _at >= _fileEnd )
        {
            return false;
        }
    }

    // test whether the current character is a number.
    if ( isdigit( *num ) )
    {
        // if it is, then determine whether the number is a float.
        if ( IsFloat( num ) )
        {
            // find the end of the float by iterating until we run into
            // a non-digit, with the exception of a decimal point, which
            // we skip once.
            bool skipDecimal = true;
            const char* end;
            for ( end = num; end < _fileEnd; ++end )
            {
                if ( !isdigit( *end ) )
                {
                    // if the current character isn't a decimal, we've found
                    // the end of the float.
                    if ( *end != '.' )
                    {
                        break;
                    }

                    // otherwise, if we already skipped one decimal, then this
                    // second decimal is the end of the float.
                    if ( !skipDecimal )
                    {
                        break;
                    }

                    // the current character is the first decimal we've run into,
                    // so indicate that we shouldn't skip any more decimals.
                    skipDecimal = false;
                }
            }
            unsigned int length = end - _at;

            // create a float token.
            return AddToken( TT_FLOAT, length, length );
        }
        else
        {
            // find the end of the integer.
            const char* end;
            for ( end = num; end < _fileEnd; ++end )
            {
                if ( !isdigit( *end ) )
                {
                    break;
                }
            }
            unsigned int length = end - _at;

            // create an integer token.
            return AddToken( TT_INTEGER, length, length );
        }
    }

    // if the current character is one of the single-letter tokens,
    // then tokenize it.
    switch ( *_at )
    {
    case '{': return AddToken( TT_OPEN_BRACE, 1, 1 );
    case '}': return AddToken( TT_CLOSE_BRACE, 1, 1 );
    case '[': return AddToken( TT_OPEN_BRACKET, 1, 1 );
    case ']': return AddToken( TT_CLOSE_BRACKET, 1, 1 );
    case ',': return AddToken( TT_COMMA, 1, 1 );
    case ';': return AddToken( TT_SEMICOLON, 1, 1 );
    }

    // if the current character is a quote, then create a string token.
    if ( *_at == '"' )
    {
        // advance past the opening quote.
        ++_at;

        // find the position of the end quote.
        unsigned int length = Find( '"', _at );

        // create a string token, advancing the file pointer past the
        // close quote.
        return AddToken( TT_STRING, length, length+1 );
    }

    // if the current character is a <, then create a uuid token.
    if ( *_at == '<' )
    {
        // advance past the opening symbol.
        ++_at;

        // find the position of the closing symbol.
        unsigned int length = Find( '>', _at );

        // create a uuid token, advancing the file pointer one past the
        // closing symbol.
        return AddToken( TT_UUID, length, length+1 );
    }

    // if the current character is a letter or an underscore, parse an
    // identifier.
    if ( isalpha( *_at ) || *_at == '_' )
    {
        // find the end of the identifier by searching for the first
        // non-alpha-numeric character that also isn't an underscore.
        const char* end;
        for ( end = _at+1; end < _fileEnd; ++end )
        {
            if ( !isalpha( *end ) && !isdigit( *end ) && *end != '_' )
            {
                break;
            }
        }
        unsigned int length = end - _at;

        // create an identifier token..
        return AddToken( TT_IDENTIFIER, length, length );
    }

    // if the file pointer is pointing to a pound sign or a double forward
    // slash (//), then it is a line comment, so skip the rest of the line.
    if ( *_at == '#' || ( ( _fileEnd - _at ) >= 2 && ( _at[0] == '/' ) && ( _at[1] == '/' ) ) )
    {
        // advance the file pointer to the end of the line.
        while ( *_at != '\n' )
        {
            ++_at;

            // if we've reached the end of the file, stop.
            if ( _at >= _fileEnd )
            {
                return false;
            }
        }
    }

    // the character is not a number, open brace, close brace, comma, semicolon,
    // string, uuid, identifier, or comma; it must be invalid.  We could raise
    // an error condition, but we may as well just skip it.
    ++_at;

    // return whether the whole file has been lexed.
    return ( _at < _fileEnd );
}

//--------------------------------------------------------------------
// Appends a token of the given type covering [_at, _at + length) to the
// token list, then moves the file pointer forward by 'advance' characters.
// Returns true while the file pointer is still inside the file.
bool
XLex::AddToken( ETokenType type, unsigned int length, unsigned int advance )
{
    // grab an unused token and fill it in.
    XToken* token = AllocateToken();
    token->type = type;
    token->start = _at;
    token->end = _at + length;
    token->next = 0;

    // link the token after the previous one, if any.
    if ( _last )
    {
        _last->next = token;
    }

    // the very first token also becomes the head of the list.
    if ( !_head )
    {
        _head = token;
    }

    // the new token is now the most recent one.
    _last = token;

    // one more token has been produced.
    ++_tokenCount;

    // move the file pointer past the consumed characters.
    _at += advance;

    // report whether any input remains.
    const bool reachedEnd = ( _at >= _fileEnd );
    return !reachedEnd;
}

//--------------------------------------------------------------------
// Hands out the next unused token of the current pool, first switching to a
// newly allocated pool if the current one has no unused tokens left.
XToken*
XLex::AllocateToken()
{
    // the pool size must be positive and an initial pool must exist.
    assert( TOKEN_POOL_SIZE > 0 );
    assert( _tokenPool );

    // when every token of the current pool has been handed out, start a new
    // pool and remember it so that it is freed on destruction.
    const bool poolExhausted = !( _tokenPoolIdx < TOKEN_POOL_SIZE );
    if ( poolExhausted )
    {
        _tokenPool = new XToken[ TOKEN_POOL_SIZE ];
        _tokenPools.push_back( _tokenPool );
        _tokenPoolIdx = 0;
    }

    // take the next unused slot and mark it as used.
    XToken* const slot = _tokenPool + _tokenPoolIdx;
    ++_tokenPoolIdx;
    return slot;
}

//--------------------------------------------------------------------
unsigned int
XLex::Find( char c, const char* start ) const
{
    // find the first occurrence of the character.
    for ( const char* iter = start; iter != _fileEnd; ++iter )
    {
        if ( *iter == c )
        {
            // return its position relative to '_at'.
            return iter - _at;
        }
    }

    // if we've reached the end of the file, return the
    // position of the last valid character.
    return ( _fileEnd - 1 ) - _at;
}

//--------------------------------------------------------------------
bool
XLex::IsFloat( const char* pos ) const
{
    // if the current character is a negation sign, skip it.
    if ( *pos == '-' )
    {
        ++pos;
    }

    // search for a decimal point.
    for ( const char* iter = pos; iter < _fileEnd; ++iter )
    {
        char c = *iter;

        // if the current character is not a digit, test whether it's
        // a decimal.  If it is, the number is a float, so return true.
        // Otherwise, the number is not a float, so return false.
        if ( !isdigit( c ) )
        {
            return ( c == '.' );
        }
    }

    // if we've reached the end of the file without finding a decimal
    // but without finding a non-digit, then return false, because this
    // number is an integer.
    return false;
}