Advertisement
keverman

gtrie (quick predefined key approximated search)

Nov 25th, 2018 (edited)
223
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 4.50 KB | None | 0 0
  1. // Goof trie complexities:
  2. // n - key length, A - alphabet size
  3. // md - maximal allowed Levenshtein distance
  4. // add, erase - O(n * log(A))
  5. // find - O(n * A ^ md) [closer to O(A ^ md) for real-world data]
  6.  
  7. // A structure that represents a node in a goof trie
  8. template<class T>
  9. struct gtrie_node
  10. {
  11.     std::map<char, gtrie_node<T>*> sub_nodes;
  12.     std::optional<T> value;
  13.  
  14.     ~gtrie_node()
  15.     {
  16.         for (auto& sub_node : sub_nodes)
  17.             delete sub_node.second;
  18.     }
  19. };
  20.  
  21. // Helper structure used to find an apporimxation of gtrie query
  22. template <class T>
  23. struct gtrie_snode
  24. {
  25.     gtrie_node<T>* node;
  26.     int depth, distance;
  27.  
  28.     gtrie_snode() {}
  29.     gtrie_snode(gtrie_node<T>* node, int depth, int distance)
  30.     {
  31.         this->node = node;
  32.         this->depth = depth;
  33.         this->distance = distance;
  34.     }
  35. };
  36.  
  37. // Compares two gtrie snodes by their distance and then depth
  38. template <class T>
  39. struct gtrie_comparator
  40. {
  41.     bool operator()(const gtrie_snode<T>& l, const gtrie_snode<T>& r)
  42.     {
  43.         if (l.distance == r.distance) return l.depth > r.depth;
  44.         return l.distance > r.distance;
  45.     }
  46. };
  47.  
  48. // Goof trie is a trie-like structure that allows for inaccuracies when searching for a key
  49. // Using the find function, user can search through the dictionary for a phrase with
  50. // the smallest Levenshtein distance (in reference to the key string)
  51. template<class T>
  52. class gtrie
  53. {
  54. private:
  55.  
  56.     gtrie_node<T> root;
  57.     T default_result;
  58.  
  59. public:
  60.  
  61.     gtrie(T default_result = T())
  62.     {
  63.         this->default_result = default_result;
  64.     }
  65.  
  66.     // Uses key to build T on the trie
  67.     void add(std::string key, T value)
  68.     {
  69.         gtrie_node<T>* cur = &root;
  70.  
  71.         for (char c : key)
  72.         {
  73.             if (cur->sub_nodes.find(c) == cur->sub_nodes.end())
  74.                 cur->sub_nodes[c] = new gtrie_node<T>();
  75.             cur = cur->sub_nodes[c];
  76.         }
  77.  
  78.         cur->value = value;
  79.     }
  80.  
  81.     // Erases the key from the tree, cleaning up empty nodes
  82.     void erase(std::string key)
  83.     {
  84.         erase(key, 0, &root);
  85.     }
  86.  
  87.     // Recursive erease function
  88.     bool erase(std::string& key, int pos, gtrie_node<T>* node)
  89.     {
  90.         // Hit the end of the tree
  91.         if (pos == key.size())
  92.         {
  93.             node->value = default_result;
  94.             return true;
  95.         }
  96.  
  97.         // Continue descent if a the key exists on the tree
  98.         if (node->sub_nodes.find(key[pos]) != node->sub_nodes.end())
  99.         {
  100.             // Call subroutine, and consider erasing subnodes if it returned true
  101.             gtrie_node<T>* next = node->sub_nodes[key[pos]];
  102.             if (!erase(key, pos + 1, next))
  103.                 return false;
  104.  
  105.             // There are no other keys extending from the current branch, so delete the node
  106.             if (next->sub_nodes.empty())
  107.             {
  108.                 node->sub_nodes.erase(key[pos]);
  109.                 delete next;
  110.                 return true;
  111.             }
  112.         }
  113.  
  114.         return false;
  115.     }
  116.  
  117.     // Returns the best match (with smallest distance) found in the tree
  118.     T find(std::string phrase, int max_distance = 1)
  119.     {
  120.         std::vector<gtrie_snode<T>> res;    // Search results
  121.         std::priority_queue<gtrie_snode<T>,
  122.             std::vector<gtrie_snode<T>>,
  123.             gtrie_comparator<T>> PQ;        // Nodes priority queue
  124.  
  125.         // Rotate through all eligible search nodes starting from the root
  126.         PQ.push(gtrie_snode(&root, 0, 0));
  127.         while (!PQ.empty())
  128.         {
  129.             gtrie_snode<T> snode = PQ.top(); PQ.pop();         // Current node
  130.             int distance_left = max_distance - snode.distance; // Current node's left distance
  131.  
  132.             // If the node is in range of word distance and it is the last letter of gtrie's entry then it's a match
  133.             if ((int)phrase.size() <= snode.depth + distance_left &&
  134.                 (int)phrase.size() >= std::max(0, snode.depth - distance_left) &&
  135.                 snode.node->value.has_value())
  136.             {
  137.                 res.push_back(snode);
  138.  
  139.                 // Break if the match is perfect
  140.                 if (snode.distance == 0 && snode.depth == phrase.size())
  141.                     break;
  142.             }
  143.  
  144.             // Continue descent
  145.             else if (snode.depth < phrase.size())
  146.             {
  147.                 // Queue eglible subnodes
  148.                 for (auto itn : snode.node->sub_nodes)
  149.                 {
  150.                     if (itn.first == phrase[snode.depth])   PQ.push({ itn.second, snode.depth + 1, snode.distance });
  151.                     else if (snode.distance < max_distance) PQ.push({ itn.second, snode.depth + 1, snode.distance + 1 });
  152.                 }
  153.  
  154.                 // Skip the current character
  155.                 if (snode.distance < max_distance)
  156.                     PQ.push({ snode.node, snode.depth + 1, snode.distance + 1 });
  157.             }
  158.         }
  159.  
  160.         // If the search results aren't empty, assign the return value
  161.         // Return value should have the smallest distance possible
  162.         if (res.empty())
  163.             return default_result;
  164.  
  165.         int mn = 0;
  166.         for (int i = 1; i < res.size(); i++)
  167.             if (res[i].distance < res[mn].distance)
  168.                 mn = i;
  169.         return res[mn].node->value.value();
  170.     }
  171. };
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement