Advertisement
Guest User

Untitled

a guest
Jan 20th, 2018
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 18.31 KB | None | 0 0
  1. #include <iostream>
  2. #include <iomanip>
  3. #include <cstdlib>
  4. #include <string.h>
  5. #include <cassert>
  6. #include <string>
  7.  
  8. using std::cout;
  9. using std::cin;
  10. using std::cerr;
  11. using std::setw;
  12. using std::flush;
  13. using std::endl;
  14.  
  15. //
  16. // When a new tree is added to the table, we step
  17. // through all the currently defined suffixes from
  18. // the active point to the end point.  This structure
  19. // defines a Suffix by its final character.
  20. // In the canonical representation, we define that last
  21. // character by starting at a node in the tree, and
  22. // following a string of characters, represented by
  23. // first_char_index and last_char_index.  The two indices
  24. // point into the input string.  Note that if a suffix
  25. // ends at a node, there are no additional characters
  26. // needed to characterize its last character position.
  27. // When this is the case, we say the node is Explicit,
  28. // and set first_char_index > last_char_index to flag
  29. // that.
  30. //
  31.  
  32. class Suffix {
  33.     public :
  34.         int origin_node;
  35.         int first_char_index;
  36.         int last_char_index;
  37.         Suffix( int node, int start, int stop )
  38.             : origin_node( node ),
  39.               first_char_index( start ),
  40.               last_char_index( stop ){};
  41.         int Explicit(){ return first_char_index > last_char_index; }
  42.         int Implicit(){ return last_char_index >= first_char_index; }
  43.         void Canonize();
  44. };
  45.  
  46. //
  47. // The suffix tree is made up of edges connecting nodes.
  48. // Each edge represents a string of characters starting
  49. // at first_char_index and ending at last_char_index.
  50. // Edges can be inserted and removed from a hash table,
  51. // based on the Hash() function defined here.  The hash
  52. // table indicates an unused slot by setting the
  53. // start_node value to -1.
  54. //
  55.  
  56. class Edge {
  57.     public :
  58.         int first_char_index;
  59.         int last_char_index;
  60.         int end_node;
  61.         int start_node;
  62.         void Insert();
  63.         void Remove();
  64.         Edge();
  65.         Edge( int init_first_char_index,
  66.               int init_last_char_index,
  67.               int parent_node );
  68.         int SplitEdge( Suffix &s );
  69.         static Edge Find( int node, int c );
  70.         static int Hash( int node, int c );
  71. };
  72.  
  73. //
  74. //  The only information contained in a node is the
  75. //  suffix link. Each suffix in the tree that ends
  76. //  at a particular node can find the next smaller suffix
  77. //  by following the suffix_node link to a new node.  Nodes
  78. //  are stored in a simple array.
  79. //
  80. class Node {
  81.     public :
  82.         int suffix_node;
  83.         Node() { suffix_node = -1; }
  84.         static int Count;
  85. };
  86.  
  87. //
  88. // The maximum input string length this program
  89. // will handle is defined here.  A suffix tree
  90. // can have as many as 2N edges/nodes.  The edges
  91. // are stored in a hash table, whose size is also
  92. // defined here.
  93. //
  94. const int MAX_LENGTH = 1000;
  95. const int HASH_TABLE_SIZE = 2179;  //A prime roughly 10% larger
  96.  
  97. //
  98. // This is the hash table where all the currently
  99. // defined edges are stored.  You can dump out
  100. // all the currently defined edges by iterating
  101. // through the table and finding edges whose start_node
  102. // is not -1.
  103. //
  104.  
  105. Edge Edges[ HASH_TABLE_SIZE ];
  106.  
  107. //
  108. // The array of defined nodes.  The count is 1 at the
  109. // start because the initial tree has the root node
  110. // defined, with no children.
  111. //
  112.  
  113. int Node::Count = 1;
  114. Node Nodes[ MAX_LENGTH * 2 ];
  115.  
  116. //
  117. // The input buffer and character count.  Please note that N
  118. // is the length of the input string -1, which means it
  119. // denotes the maximum index in the input buffer.
  120. //
  121.  
  122. char T[ MAX_LENGTH ];
  123. int N;
  124.  
  125. //
  126. // Necessary forward references
  127. //
  128. void validate();
  129. int walk_tree( int start_node, int last_char_so_far );
  130.  
  131.  
  132. //
  133. // The default ctor for Edge just sets start_node
  134. // to the invalid value.  This is done to guarantee
  135. // that the hash table is initially filled with unused
  136. // edges.
  137. //
  138.  
  139. Edge::Edge()
  140. {
  141.     start_node = -1;
  142. }
  143.  
  144. //
  145. // I create new edges in the program while walking up
  146. // the set of suffixes from the active point to the
  147. // endpoint.  Each time I create a new edge, I also
  148. // add a new node for its end point.  The node entry
  149. // is already present in the Nodes[] array, and its
  150. // suffix node is set to -1 by the default Node() ctor,
  151. // so I don't have to do anything with it at this point.
  152. //
  153.  
  154. Edge::Edge( int init_first, int init_last, int parent_node )
  155. {
  156.     first_char_index = init_first;
  157.     last_char_index = init_last;
  158.     start_node = parent_node;
  159.     end_node = Node::Count++;
  160. }
  161.  
  162. //
  163. // Edges are inserted into the hash table using this hashing
  164. // function.
  165. //
  166.  
  167. int Edge::Hash( int node, int c )
  168. {
  169.     return ( ( node << 8 ) + c ) % HASH_TABLE_SIZE;
  170. }
  171.  
  172. //
  173. // A given edge gets a copy of itself inserted into the table
  174. // with this function.  It uses a linear probe technique, which
  175. // means in the case of a collision, we just step forward through
  176. // the table until we find the first unused slot.
  177. //
  178.  
  179. void Edge::Insert()
  180. {
  181.     int i = Hash( start_node, T[ first_char_index ] );
  182.     while ( Edges[ i ].start_node != -1 )
  183.         i = ++i % HASH_TABLE_SIZE;
  184.     Edges[ i ] = *this;
  185. }
  186.  
  187. //
  188. // Removing an edge from the hash table is a little more tricky.
  189. // You have to worry about creating a gap in the table that will
  190. // make it impossible to find other entries that have been inserted
  191. // using a probe.  Working around this means that after setting
  192. // an edge to be unused, we have to walk ahead in the table,
  193. // filling in gaps until all the elements can be found.
  194. //
  195. // Knuth, Sorting and Searching, Algorithm R, p. 527
  196. //
  197.  
  198. void Edge::Remove()
  199. {
  200.     int i = Hash( start_node, T[ first_char_index ] );
  201.     while ( Edges[ i ].start_node != start_node ||
  202.             Edges[ i ].first_char_index != first_char_index )
  203.         i = ++i % HASH_TABLE_SIZE;
  204.     for ( ; ; ) {
  205.         Edges[ i ].start_node = -1;
  206.         int j = i;
  207.         for ( ; ; ) {
  208.             i = ++i % HASH_TABLE_SIZE;
  209.             if ( Edges[ i ].start_node == -1 )
  210.                 return;
  211.             int r = Hash( Edges[ i ].start_node, T[ Edges[ i ].first_char_index ] );
  212.             if ( i >= r && r > j )
  213.                 continue;
  214.             if ( r > j && j > i )
  215.                 continue;
  216.             if ( j > i && i >= r )
  217.                 continue;
  218.             break;
  219.         }
  220.         Edges[ j ] = Edges[ i ];
  221.     }
  222. }
  223.  
  224. //
  225. // The whole reason for storing edges in a hash table is that it
  226. // makes this function fairly efficient.  When I want to find a
  227. // particular edge leading out of a particular node, I call this
  228. // function.  It locates the edge in the hash table, and returns
  229. // a copy of it.  If the edge isn't found, the edge that is returned
  230. // to the caller will have start_node set to -1, which is the value
  231. // used in the hash table to flag an unused entry.
  232. //
  233.  
  234. Edge Edge::Find( int node, int c )
  235. {
  236.     int i = Hash( node, c );
  237.     for ( ; ; ) {
  238.         if ( Edges[ i ].start_node == node )
  239.             if ( c == T[ Edges[ i ].first_char_index ] )
  240.                 return Edges[ i ];
  241.         if ( Edges[ i ].start_node == -1 )
  242.             return Edges[ i ];
  243.         i = ++i % HASH_TABLE_SIZE;
  244.     }
  245. }
  246.  
  247. //
  248. // When a suffix ends on an implicit node, adding a new character
  249. // means I have to split an existing edge.  This function is called
  250. // to split an edge at the point defined by the Suffix argument.
  251. // The existing edge loses its parent, as well as some of its leading
  252. // characters.  The newly created edge descends from the original
  253. // parent, and now has the existing edge as a child.
  254. //
  255. // Since the existing edge is getting a new parent and starting
  256. // character, its hash table entry will no longer be valid.  That's
  257. // why it gets removed at the start of the function.  After the parent
  258. // and start char have been recalculated, it is re-inserted.
  259. //
  260. // The number of characters stolen from the original node and given
  261. // to the new node is equal to the number of characters in the suffix
  262. // argument, which is last - first + 1;
  263. //
  264.  
  265. int Edge::SplitEdge( Suffix &s )
  266. {
  267.     Remove();
  268.     Edge *new_edge =
  269.       new Edge( first_char_index,
  270.                 first_char_index + s.last_char_index - s.first_char_index,
  271.                 s.origin_node );
  272.     new_edge->Insert();
  273.     Nodes[ new_edge->end_node ].suffix_node = s.origin_node;
  274.     first_char_index += s.last_char_index - s.first_char_index + 1;
  275.     start_node = new_edge->end_node;
  276.     Insert();
  277.     return new_edge->end_node;
  278. }
  279.  
  280. //
  281. // This routine prints out the contents of the suffix tree
  282. // at the end of the program by walking through the
  283. // hash table and printing out all used edges.  It
  284. // would be really great if I had some code that will
  285. // print out the tree in a graphical fashion, but I don't!
  286. //
  287.  
  288. void dump_edges( int current_n )
  289. {
  290.     cout << " Start  End  Suf  First Last  String\n";
  291.     for ( int j = 0 ; j < HASH_TABLE_SIZE ; j++ ) {
  292.         Edge *s = Edges + j;
  293.         if ( s->start_node == -1 )
  294.             continue;
  295.         cout << setw( 5 ) << s->start_node << " "
  296.              << setw( 5 ) << s->end_node << " "
  297.              << setw( 3 ) << Nodes[ s->end_node ].suffix_node << " "
  298.              << setw( 5 ) << s->first_char_index << " "
  299.              << setw( 6 ) << s->last_char_index << "  ";
  300.         int top;
  301.         if ( current_n > s->last_char_index )
  302.             top = s->last_char_index;
  303.         else
  304.             top = current_n;
  305.         for ( int l = s->first_char_index ;
  306.                   l <= top;
  307.                   l++ )
  308.             cout << T[ l ];
  309.         cout << "\n";
  310.     }
  311. }
  312.  
  313. //
  314. // A suffix in the tree is denoted by a Suffix structure
  315. // that denotes its last character.  The canonical
  316. // representation of a suffix for this algorithm requires
  317. // that the origin_node by the closest node to the end
  318. // of the tree.  To force this to be true, we have to
  319. // slide down every edge in our current path until we
  320. // reach the final node.
  321.  
  322. void Suffix::Canonize()
  323. {
  324.     if ( !Explicit() ) {
  325.         Edge edge = Edge::Find( origin_node, T[ first_char_index ] );
  326.         int edge_span = edge.last_char_index - edge.first_char_index;
  327.         while ( edge_span <= ( last_char_index - first_char_index ) ) {
  328.             first_char_index = first_char_index + edge_span + 1;
  329.             origin_node = edge.end_node;
  330.             if ( first_char_index <= last_char_index ) {
  331.                edge = Edge::Find( edge.end_node, T[ first_char_index ] );
  332.                 edge_span = edge.last_char_index - edge.first_char_index;
  333.             };
  334.         }
  335.     }
  336. }
  337.  
  338. //
  339. // This routine constitutes the heart of the algorithm.
  340. // It is called repetitively, once for each of the prefixes
  341. // of the input string.  The prefix in question is denoted
  342. // by the index of its last character.
  343. //
  344. // At each prefix, we start at the active point, and add
  345. // a new edge denoting the new last character, until we
  346. // reach a point where the new edge is not needed due to
  347. // the presence of an existing edge starting with the new
  348. // last character.  This point is the end point.
  349. //
  350. // Luckily for use, the end point just happens to be the
  351. // active point for the next pass through the tree.  All
  352. // we have to do is update it's last_char_index to indicate
  353. // that it has grown by a single character, and then this
  354. // routine can do all its work one more time.
  355. //
  356.  
  357. void AddPrefix( Suffix &active, int last_char_index )
  358. {
  359.     int parent_node;
  360.     int last_parent_node = -1;
  361.  
  362.     for ( ; ; ) {
  363.         Edge edge;
  364.         parent_node = active.origin_node;
  365. //
  366. // Step 1 is to try and find a matching edge for the given node.
  367. // If a matching edge exists, we are done adding edges, so we break
  368. // out of this big loop.
  369. //
  370.         if ( active.Explicit() ) {
  371.             edge = Edge::Find( active.origin_node, T[ last_char_index ] );
  372.             if ( edge.start_node != -1 )
  373.                 break;
  374.         } else { //implicit node, a little more complicated
  375.             edge = Edge::Find( active.origin_node, T[ active.first_char_index ] );
  376.             int span = active.last_char_index - active.first_char_index;
  377.             if ( T[ edge.first_char_index + span + 1 ] == T[ last_char_index ] )
  378.                 break;
  379.             parent_node = edge.SplitEdge( active );
  380.         }
  381. //
  382. // We didn't find a matching edge, so we create a new one, add
  383. // it to the tree at the parent node position, and insert it
  384. // into the hash table.  When we create a new node, it also
  385. // means we need to create a suffix link to the new node from
  386. // the last node we visited.
  387. //
  388.         Edge *new_edge = new Edge( last_char_index, N, parent_node );
  389.         new_edge->Insert();
  390.         if ( last_parent_node > 0 )
  391.             Nodes[ last_parent_node ].suffix_node = parent_node;
  392.         last_parent_node = parent_node;
  393. //
  394. // This final step is where we move to the next smaller suffix
  395. //
  396.         if ( active.origin_node == 0 )
  397.             active.first_char_index++;
  398.         else
  399.             active.origin_node = Nodes[ active.origin_node ].suffix_node;
  400.         active.Canonize();
  401.     }
  402.     if ( last_parent_node > 0 )
  403.         Nodes[ last_parent_node ].suffix_node = parent_node;
  404.     active.last_char_index++;  //Now the endpoint is the next active point
  405.     active.Canonize();
  406. };
  407.  
  408. int main()
  409. {
  410.     cout << "Normally, suffix trees require that the last\n"
  411.          << "character in the input string be unique.  If\n"
  412.          << "you don't do this, your tree will contain\n"
  413.          << "suffixes that don't end in leaf nodes.  This is\n"
  414.          << "often a useful requirement. You can build a tree\n"
  415.          << "in this program without meeting this requirement,\n"
  416.          << "but the validation code will flag it as being an\n"
  417.          << "invalid tree\n\n";
  418.     cout << "Enter string: " << flush;
  419.     cin.getline( T, MAX_LENGTH - 1 );
  420.     N = strlen( T ) - 1;
  421. //
  422. // The active point is the first non-leaf suffix in the
  423. // tree.  We start by setting this to be the empty string
  424. // at node 0.  The AddPrefix() function will update this
  425. // value after every new prefix is added.
  426. //
  427.     Suffix active( 0, 0, -1 );  // The initial active prefix
  428.     for ( int i = 0 ; i <= N ; i++ )
  429.         AddPrefix( active, i );
  430. //
  431. // Once all N prefixes have been added, the resulting table
  432. // of edges is printed out, and a validation step is
  433. // optionally performed.
  434. //
  435.     dump_edges( N );
  436.     cout << "Would you like to validate the tree?"
  437.          << flush;
  438.     std::string s;
  439.     getline( cin, s );
  440.     if ( s.size() > 0 && s[ 0 ] == 'Y' || s[ 0 ] == 'y' )
  441.         validate();
  442.     return 1;
  443. };
  444.  
  445. //
  446. // The validation code consists of two routines.  All it does
  447. // is traverse the entire tree.  walk_tree() calls itself
  448. // recursively, building suffix strings up as it goes.  When
  449. // walk_tree() reaches a leaf node, it checks to see if the
  450. // suffix derived from the tree matches the suffix starting
  451. // at the same point in the input text.  If so, it tags that
  452. // suffix as correct in the GoodSuffixes[] array.  When the tree
  453. // has been traversed, every entry in the GoodSuffixes array should
  454. // have a value of 1.
  455. //
  456. // In addition, the BranchCount[] array is updated while the tree is
  457. // walked as well.  Every count in the array has the
  458. // number of child edges emanating from that node.  If the node
  459. // is a leaf node, the value is set to -1.  When the routine
  460. // finishes, every node should be a branch or a leaf.  The number
  461. // of leaf nodes should match the number of suffixes (the length)
  462. // of the input string.  The total number of branches from all
  463. // nodes should match the node count.
  464. //
  465.  
  466. char CurrentString[ MAX_LENGTH ];
  467. char GoodSuffixes[ MAX_LENGTH ];
  468. char BranchCount[ MAX_LENGTH * 2 ] = { 0 };
  469.  
  470. void validate()
  471. {
  472.     for ( int i = 0 ; i < N ; i++ )
  473.         GoodSuffixes[ i ] = 0;
  474.     walk_tree( 0, 0 );
  475.     int error = 0;
  476.     for ( int i = 0 ; i < N ; i++ )
  477.         if ( GoodSuffixes[ i ] != 1 ) {
  478.             cout << "Suffix " << i << " count wrong!\n";
  479.             error++;
  480.         }
  481.     if ( error == 0 )
  482.         cout << "All Suffixes present!\n";
  483.     int leaf_count = 0;
  484.     int branch_count = 0;
  485.     for ( int i = 0 ; i < Node::Count ; i++ ) {
  486.         if ( BranchCount[ i ] == 0 )
  487.             cout << "Logic error on node "
  488.                  << i
  489.                  << ", not a leaf or internal node!\n";
  490.         else if ( BranchCount[ i ] == -1 )
  491.             leaf_count++;
  492.         else
  493.             branch_count += BranchCount[ i ];
  494.     }
  495.     cout << "Leaf count : "
  496.          << leaf_count
  497.          << ( leaf_count == ( N + 1 ) ? " OK" : " Error!" )
  498.          << "\n";
  499.     cout << "Branch count : "
  500.          << branch_count
  501.          << ( branch_count == (Node::Count - 1) ? " OK" : " Error!" )
  502.          << endl;
  503. }
  504.  
  505. int walk_tree( int start_node, int last_char_so_far )
  506. {
  507.     int edges = 0;
  508.     for ( int i = 0 ; i < 256 ; i++ ) {
  509.         Edge edge = Edge::Find( start_node, i );
  510.         if ( edge.start_node != -1 ) {
  511.             if ( BranchCount[ edge.start_node ] < 0 )
  512.                 cerr << "Logic error on node "
  513.                      << edge.start_node
  514.                      << '\n';
  515.             BranchCount[ edge.start_node ]++;
  516.             edges++;
  517.             int l = last_char_so_far;
  518.             for ( int j = edge.first_char_index ; j <= edge.last_char_index ; j++ )
  519.                 CurrentString[ l++ ] = T[ j ];
  520.             CurrentString[ l ] = '\0';
  521.             if ( walk_tree( edge.end_node, l ) ) {
  522.                 if ( BranchCount[ edge.end_node ] > 0 )
  523.                         cerr << "Logic error on node "
  524.                              << edge.end_node
  525.                              << "\n";
  526.                 BranchCount[ edge.end_node ]--;
  527.             }
  528.         }
  529.     }
  530. //
  531. // If this node didn't have any child edges, it means we
  532. // are at a leaf node, and can check on this suffix.  We
  533. // check to see if it matches the input string, then tick
  534. // off it's entry in the GoodSuffixes list.
  535. //
  536.     if ( edges == 0 ) {
  537.         cout << "Suffix : ";
  538.         for ( int m = 0 ; m < last_char_so_far ; m++ )
  539.             cout << CurrentString[ m ];
  540.         cout << "\n";
  541.         GoodSuffixes[ strlen( CurrentString ) - 1 ]++;
  542.         cout << "comparing: " << ( T + N - strlen( CurrentString ) + 1 )
  543.              << " to " << CurrentString << endl;
  544.         if ( strcmp(T + N - strlen(CurrentString) + 1, CurrentString ) != 0 )
  545.             cout << "Comparison failure!\n";
  546.         return 1;
  547.     } else
  548.         return 0;
  549. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement