Advertisement
Guest User

Shawn Presser

a guest
Nov 14th, 2008
129
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 14.52 KB | None | 0 0
  1. //====================================================================
  2. // File:        XLex.h
  3. // Author:      Shawn Presser
  4. // Contact:     shawnpresser@gmail.com
  5. // Date:        11-13-08
  6. //
  7. // Purpose:     To separate an ASCII .x file into a set of tokens.
  8. //
  9. // Copyright (c) 2008 Shawn Presser
  10. //
  11. // Permission is hereby granted, free of charge, to any person
  12. // obtaining a copy of this software and associated documentation
  13. // files (the "Software"), to deal in the Software without
  14. // restriction, including without limitation the rights to use,
  15. // copy, modify, merge, publish, distribute, sublicense, and/or sell
  16. // copies of the Software, and to permit persons to whom the
  17. // Software is furnished to do so, subject to the following
  18. // conditions:
  19. //
  20. // The above copyright notice and this permission notice shall be
  21. // included in all copies or substantial portions of the Software.
  22. //
  23. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  25. // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  27. // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  28. // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  29. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  30. // OTHER DEALINGS IN THE SOFTWARE.
  31. //====================================================================
  32. #pragma once
  33.  
  34. // std c++ headers.
  35. #include <list>
  36.  
  37. // forward declarations.
  38. struct XToken;
  39. enum ETokenType;
  40.  
  41. //====================================================================
  42. // XLex
  43. //====================================================================
  44. class XLex
  45. {
  46. public:
  47.     // lexes a .X file that resides in memory.
  48.     XLex( const char* file, size_t fileSize );
  49.     ~XLex();
  50.  
  51.     // returns the list of X file tokens.
  52.     XToken*             GetTokens() const       {   return _head;       }
  53.  
  54. private:
  55.     typedef std::list< XToken* >        TokenContainer;
  56.  
  57.     // generates one token.
  58.     bool                Tokenize();
  59.  
  60.     // constructs a new token and finalizes any previous token.
  61.     bool                AddToken( ETokenType type, unsigned int length, unsigned int advance );
  62.  
  63.     // allocates a new token.
  64.     XToken*             AllocateToken();
  65.  
  66.     // searches for a character and returns its position (relative to
  67.     // '_at').  If the end of the file is reached, the position of the
  68.     // last character in the file is returned.
  69.     unsigned int        Find( char c, const char* start ) const;
  70.  
  71.     // returns true if the pointer points to a float (rather than an int).
  72.     bool                IsFloat( const char* pos ) const;
  73.  
  74.     // store a list of token arrays to be freed on destruction.
  75.     TokenContainer      _tokenPools;
  76.  
  77.     // the current token pool from which to pull new tokens.
  78.     XToken*             _tokenPool;
  79.     unsigned int        _tokenPoolIdx;
  80.  
  81.     // the result token list.
  82.     XToken*             _head;
  83.     XToken*             _last;
  84.  
  85.     // the file data to lex.
  86.     const char*         _fileStart;
  87.     const char*         _fileEnd;
  88.     const char*         _at;
  89.  
  90.     // the current line number.
  91.     unsigned int        _lineNumber;
  92.  
  93.     // the number of result tokens.
  94.     unsigned int        _tokenCount;
  95. };
  96.  
  97. //====================================================================
  98. // File:        XLex.cpp
  99. // Author:      Shawn Presser
  100. // Contact:     shawnpresser@gmail.com
  101. // Date:        11-13-08
  102. //
  103. // Copyright (c) 2008 Shawn Presser
  104. //
  105. // Permission is hereby granted, free of charge, to any person
  106. // obtaining a copy of this software and associated documentation
  107. // files (the "Software"), to deal in the Software without
  108. // restriction, including without limitation the rights to use,
  109. // copy, modify, merge, publish, distribute, sublicense, and/or sell
  110. // copies of the Software, and to permit persons to whom the
  111. // Software is furnished to do so, subject to the following
  112. // conditions:
  113. //
  114. // The above copyright notice and this permission notice shall be
  115. // included in all copies or substantial portions of the Software.
  116. //
  117. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  118. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  119. // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  120. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  121. // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  122. // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  123. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  124. // OTHER DEALINGS IN THE SOFTWARE.
  125. //====================================================================
  126.  
  127. // module header.
  128. #include "XLex.h"
  129.  
  130. // std c headers.
  131. #include <ctype.h> // isdigit
  132. #include <assert.h> // assert
  133.  
  134. // project headers.
  135. #include "XToken.h"
  136.  
  137. // constants.
  138. #define     TOKEN_POOL_SIZE         20480
  139.  
  140. //====================================================================
  141. // XLex
  142. //====================================================================
  143.  
  144. //--------------------------------------------------------------------
  145. XLex::XLex( const char* file, size_t fileSize )
  146. : _tokenPool( 0 )
  147. , _tokenPoolIdx( 0 )
  148. , _head( 0 )
  149. , _last( 0 )
  150. , _fileStart( file )
  151. , _fileEnd( file + fileSize )
  152. , _at( file )
  153. , _lineNumber( 0 )
  154. , _tokenCount( 0 )
  155. {
  156.     // allocate an initial token pool to pull new tokens from.
  157.     _tokenPool = new XToken[ TOKEN_POOL_SIZE ];
  158.     _tokenPools.push_back( _tokenPool );
  159.  
  160.     // only tokenize if the file actually has data.
  161.     if( fileSize > 0 )
  162.     {
  163.         // tokenize the entire file.
  164.         while ( Tokenize() )
  165.         {
  166.         }
  167.     }
  168. }
  169.  
  170. //--------------------------------------------------------------------
  171. XLex::~XLex()
  172. {
  173.     // delete each token pool.
  174.     for ( TokenContainer::iterator it = _tokenPools.begin(); it != _tokenPools.end(); ++it )
  175.     {
  176.         XToken* tokenPool = *it;
  177.         delete [] tokenPool;
  178.     }
  179. }
  180.  
  181. //--------------------------------------------------------------------
  182. bool
  183. XLex::Tokenize()
  184. {
  185.     // if the current character is whitespace, just skip it.
  186.     while ( isspace( *_at ) )
  187.     {
  188.         // if the current character is a newline, increment our
  189.         // line number counter.
  190.         if ( *_at == '\n' )
  191.         {
  192.             ++_lineNumber;
  193.         }
  194.  
  195.         ++_at;
  196.  
  197.         // if we've reached the end of the file, stop.
  198.         if ( _at >= _fileEnd )
  199.         {
  200.             return false;
  201.         }
  202.     }
  203.  
  204.     // if the current character is a negation sign (-), get a pointer to the
  205.     // next character.
  206.     const char* num = _at;
  207.     if ( *num == '-' )
  208.     {
  209.         ++num;
  210.  
  211.         // if we've reached the end of the file, stop.
  212.         if ( _at >= _fileEnd )
  213.         {
  214.             return false;
  215.         }
  216.     }
  217.  
  218.     // test whether the current character is a number.
  219.     if ( isdigit( *num ) )
  220.     {
  221.         // if it is, then determine whether the number is a float.
  222.         if ( IsFloat( num ) )
  223.         {
  224.             // find the end of the float by iterating until we run into
  225.             // a non-digit, with the exception of a decimal point, which
  226.             // we skip once.
  227.             bool skipDecimal = true;
  228.             const char* end;
  229.             for ( end = num; end < _fileEnd; ++end )
  230.             {
  231.                 if ( !isdigit( *end ) )
  232.                 {
  233.                     // if the current character isn't a decimal, we've found
  234.                     // the end of the float.
  235.                     if ( *end != '.' )
  236.                     {
  237.                         break;
  238.                     }
  239.  
  240.                     // otherwise, if we already skipped one decimal, then this
  241.                     // second decimal is the end of the float.
  242.                     if ( !skipDecimal )
  243.                     {
  244.                         break;
  245.                     }
  246.  
  247.                     // the current character is the first decimal we've run into,
  248.                     // so indicate that we shouldn't skip any more decimals.
  249.                     skipDecimal = false;
  250.                 }
  251.             }
  252.             unsigned int length = end - _at;
  253.  
  254.             // create a float token.
  255.             return AddToken( TT_FLOAT, length, length );
  256.         }
  257.         else
  258.         {
  259.             // find the end of the integer.
  260.             const char* end;
  261.             for ( end = num; end < _fileEnd; ++end )
  262.             {
  263.                 if ( !isdigit( *end ) )
  264.                 {
  265.                     break;
  266.                 }
  267.             }
  268.             unsigned int length = end - _at;
  269.  
  270.             // create an integer token.
  271.             return AddToken( TT_INTEGER, length, length );
  272.         }
  273.     }
  274.  
  275.     // if the current character is one of the single-letter tokens,
  276.     // then tokenize it.
  277.     switch ( *_at )
  278.     {
  279.     case '{': return AddToken( TT_OPEN_BRACE, 1, 1 );
  280.     case '}': return AddToken( TT_CLOSE_BRACE, 1, 1 );
  281.     case '[': return AddToken( TT_OPEN_BRACKET, 1, 1 );
  282.     case ']': return AddToken( TT_CLOSE_BRACKET, 1, 1 );
  283.     case ',': return AddToken( TT_COMMA, 1, 1 );
  284.     case ';': return AddToken( TT_SEMICOLON, 1, 1 );
  285.     }
  286.  
  287.     // if the current character is a quote, then create a string token.
  288.     if ( *_at == '"' )
  289.     {
  290.         // advance past the opening quote.
  291.         ++_at;
  292.  
  293.         // find the position of the end quote.
  294.         unsigned int length = Find( '"', _at );
  295.  
  296.         // create a string token, advancing the file pointer past the
  297.         // close quote.
  298.         return AddToken( TT_STRING, length, length+1 );
  299.     }
  300.  
  301.     // if the current character is a <, then create a uuid token.
  302.     if ( *_at == '<' )
  303.     {
  304.         // advance past the opening symbol.
  305.         ++_at;
  306.  
  307.         // find the position of the closing symbol.
  308.         unsigned int length = Find( '>', _at );
  309.  
  310.         // create a uuid token, advancing the file pointer one past the
  311.         // closing symbol.
  312.         return AddToken( TT_UUID, length, length+1 );
  313.     }
  314.  
  315.     // if the current character is a letter or an underscore, parse an
  316.     // identifier.
  317.     if ( isalpha( *_at ) || *_at == '_' )
  318.     {
  319.         // find the end of the identifier by searching for the first
  320.         // non-alpha-numeric character that also isn't an underscore.
  321.         const char* end;
  322.         for ( end = _at+1; end < _fileEnd; ++end )
  323.         {
  324.             if ( !isalpha( *end ) && !isdigit( *end ) && *end != '_' )
  325.             {
  326.                 break;
  327.             }
  328.         }
  329.         unsigned int length = end - _at;
  330.  
  331.         // create an identifier token..
  332.         return AddToken( TT_IDENTIFIER, length, length );
  333.     }
  334.  
  335.     // if the file pointer is pointing to a pound sign or a double forward
  336.     // slash (//), then it is a line comment, so skip the rest of the line.
  337.     if ( *_at == '#' || ( ( _fileEnd - _at ) >= 2 && ( _at[0] == '/' ) && ( _at[1] == '/' ) ) )
  338.     {
  339.         // advance the file pointer to the end of the line.
  340.         while ( *_at != '\n' )
  341.         {
  342.             ++_at;
  343.  
  344.             // if we've reached the end of the file, stop.
  345.             if ( _at >= _fileEnd )
  346.             {
  347.                 return false;
  348.             }
  349.         }
  350.     }
  351.  
  352.     // the character is not a number, open brace, close brace, comma, semicolon,
  353.     // string, uuid, identifier, or comma; it must be invalid.  We could raise
  354.     // an error condition, but we may as well just skip it.
  355.     ++_at;
  356.  
  357.     // return whether the whole file has been lexed.
  358.     return ( _at < _fileEnd );
  359. }
  360.  
  361. //--------------------------------------------------------------------
  362. bool
  363. XLex::AddToken( ETokenType type, unsigned int length, unsigned int advance )
  364. {
  365.     // construct a new token.
  366.     XToken* result = AllocateToken();
  367.     result->type = type;
  368.     result->start = _at;
  369.     result->end = _at + length;
  370.     result->next = 0;
  371.  
  372.     // if there was a previous token, then append the new token
  373.     // to the token list.
  374.     if ( _last )
  375.     {
  376.         _last->next = result;
  377.     }
  378.  
  379.     // set the new token to be the latest token.
  380.     _last = result;
  381.  
  382.     // if this is the first token to be created, then track it
  383.     // as the 'head' token.
  384.     if ( !_head )
  385.     {
  386.         _head = result;
  387.     }
  388.  
  389.     // advance the file pointer.
  390.     _at += advance;
  391.  
  392.     // increment our token count.
  393.     ++_tokenCount;
  394.  
  395.     // return true if the file pointer hasn't reached the end,
  396.     // otherwise return false.
  397.     return ( _at < _fileEnd );
  398. }
  399.  
  400. //--------------------------------------------------------------------
  401. XToken*
  402. XLex::AllocateToken()
  403. {
  404.     // sanity check the pool size.
  405.     assert( TOKEN_POOL_SIZE > 0 );
  406.  
  407.     // validate that a token pool has been initially allocated.
  408.     assert( _tokenPool );
  409.  
  410.     // if the token pool is full, allocate a new batch of tokens
  411.     // to pull from.
  412.     if ( _tokenPoolIdx >= TOKEN_POOL_SIZE )
  413.     {
  414.         _tokenPool = new XToken[ TOKEN_POOL_SIZE ];
  415.         _tokenPools.push_back( _tokenPool );
  416.         _tokenPoolIdx = 0;
  417.     }
  418.  
  419.     // return an unused token.
  420.     return &_tokenPool[ _tokenPoolIdx++ ];
  421. }
  422.  
  423. //--------------------------------------------------------------------
  424. unsigned int
  425. XLex::Find( char c, const char* start ) const
  426. {
  427.     // find the first occurrence of the character.
  428.     for ( const char* iter = start; iter != _fileEnd; ++iter )
  429.     {
  430.         if ( *iter == c )
  431.         {
  432.             // return its position relative to '_at'.
  433.             return iter - _at;
  434.         }
  435.     }
  436.  
  437.     // if we've reached the end of the file, return the
  438.     // position of the last valid character.
  439.     return ( _fileEnd - 1 ) - _at;
  440. }
  441.  
  442. //--------------------------------------------------------------------
  443. bool
  444. XLex::IsFloat( const char* pos ) const
  445. {
  446.     // if the current character is a negation sign, skip it.
  447.     if ( *pos == '-' )
  448.     {
  449.         ++pos;
  450.     }
  451.  
  452.     // search for a decimal point.
  453.     for ( const char* iter = pos; iter < _fileEnd; ++iter )
  454.     {
  455.         char c = *iter;
  456.  
  457.         // if the current character is not a digit, test whether it's
  458.         // a decimal.  If it is, the number is a float, so return true.
  459.         // Otherwise, the number is not a float, so return false.
  460.         if ( !isdigit( c ) )
  461.         {
  462.             return ( c == '.' );
  463.         }
  464.     }
  465.  
  466.     // if we've reached the end of the file without finding a decimal
  467.     // but without finding a non-digit, then return false, because this
  468.     // number is an integer.
  469.     return false;
  470. }
  471.  
  472.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement