Guest User

Untitled

a guest
May 20th, 2018
126
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.04 KB | None | 0 0
  1. #include <iostream>
  2. #include <fstream>
  3. #include <algorithm>
  4. #include <map>
  5. #include <vector>
  6. #include <iterator>
  7. #include <numeric>
  8.  
  9. #include <string.h>
  10.  
  11. using namespace std;
  12.  
  13. class BKNode;
  14.  
  15. /**
  16. * Implementation of levenshtein distance that avoids any heap allocations and
  17. * only needs O(2 * min(length(one), length(two))) space.
  18. */
  19. int levenshtein(const string& one, const string& two) {
  20. if (one.length() > two.length()) {
  21. return levenshtein(two, one);
  22. }
  23. int length1 = one.length() + 1;
  24. int length2 = two.length() + 1;
  25. int data1[length1];
  26. int data2[length1];
  27. int* current = data1;
  28. int* previous = data2;
  29. memset(data1, 0, length1 * sizeof(int));
  30. for (int i = 0; i < length1; i++) {
  31. data2[i] = i;
  32. }
  33. for (int i = 1; i < length2; i++) {
  34. if (i > 1) {
  35. memset(previous, 0, length1 * sizeof(int));
  36. int* tmp = previous;
  37. previous = current;
  38. current = tmp;
  39. }
  40. current[0] = i;
  41. for (int j = 1; j < length1; j++) {
  42. current[j] = min(previous[j] + 1, min(current[j-1] + 1, previous[j-1] +
  43. (one[j-1] != two[i-1] ? 1 : 0)));
  44. }
  45. }
  46. return current[length1-1];
  47. }
  48.  
  49. /**
  50. * Node structure of the BK-tree.
  51. */
  52. class BKNode {
  53.  
  54. private:
  55. string value;
  56. map<int, BKNode> children;
  57. static int count;
  58.  
  59. public:
  60. BKNode(const string& value)
  61. : value(value) {
  62. }
  63.  
  64. BKNode() {
  65. }
  66.  
  67. /**
  68. * Inserts a value into the tree.
  69. */
  70. void insert(const string& value) {
  71. int distance = levenshtein(this->value, value);
  72. if (distance == 0) {
  73. return;
  74. }
  75. map<int,BKNode>::const_iterator i(children.find(distance));
  76. if (i != children.end()) {
  77. // use [] operator to avoid copy constructor
  78. children[distance].insert(value);
  79. } else {
  80. // only copy constructor call of BKNode
  81. children[distance] = BKNode(value);
  82. }
  83. }
  84.  
  85. /**
  86. * Computes minimum distance between value and this node or its children.
  87. */
  88. int minimumTreeDistance(const string& value, int currentMin) {
  89. ++count; // count invocations
  90. int distance = levenshtein(this->value, value);
  91. if (distance == 0 || children.empty()) {
  92. return distance;
  93. }
  94. currentMin = min(distance, currentMin);
  95. map<int,BKNode>::const_iterator i(children.find(distance));
  96. if (i != children.end()) {
  97. currentMin = min(children[distance].minimumTreeDistance(value, currentMin),
  98. currentMin);
  99. }
  100. if (currentMin <= 1) {
  101. return currentMin;
  102. }
  103. for (int i = distance - currentMin + 1; i < distance + currentMin; ++i) {
  104. if (i != distance && children.find(i) != children.end()) {
  105. currentMin = min(children[i].minimumTreeDistance(value, currentMin),
  106. currentMin);
  107. if (currentMin <= 1) {
  108. return currentMin;
  109. }
  110. }
  111. }
  112. return currentMin;
  113. }
  114. };
  115.  
  116. int BKNode::count = 0;
  117.  
  118. /**
  119. * Implementation of a Burkhard Keller tree for efficient retrieval
  120. * of string values based on the Levenshtein distance as metric.
  121. *
  122. * Most logic is contained in BKNode class.
  123. */
  124. class BKTree : public unary_function<int, const string&> {
  125.  
  126. private:
  127. BKNode root;
  128. int count;
  129.  
  130. public:
  131. BKTree(const string& rootValue)
  132. : root(BKNode(rootValue)) {
  133. }
  134.  
  135. void insert(const string& value) {
  136. root.insert(value);
  137. ++count;
  138. }
  139.  
  140. /**
  141. * Computes minimal distance between value and any value in the BK-tree.
  142. */
  143. int operator() (const string& value) {
  144. int distance = root.minimumTreeDistance(value, value.length());
  145. #ifdef VERBOSE
  146. cout << value << " " << BKNode::count << endl;
  147. BKNode::count = 0;
  148. #endif
  149. return distance;
  150. }
  151.  
  152. int size() const {
  153. return count;
  154. }
  155. };
  156.  
  157. /**
  158. * Trims white space on the right hand side of string. Manipulates
  159. * string argument.
  160. */
  161. string& rtrim(string& value) {
  162. size_t last = value.find_last_not_of(" \n\r\t");
  163. if (last != string::npos) {
  164. value.erase(last+1);
  165. }
  166. return value;
  167. }
  168.  
  169. int main(int argc, char** argv) {
  170. //ifstream file("/tmp/twl06.txt");
  171. ifstream file("/var/tmp/twl06.txt");
  172. // build tree from correct words
  173. BKTree tree("");
  174. if (file.is_open() && !file.eof()) {
  175. string line;
  176. getline(file, line);
  177. tree = BKTree(rtrim(line));
  178. while (!file.eof()) {
  179. getline(file, line);
  180. tree.insert(rtrim(line));
  181. }
  182. file.close();
  183. }
  184. ifstream input(argv[1]);
  185. if (input.is_open()) {
  186. vector<string> tokens;
  187. // read words from input file
  188. copy(istream_iterator<string>(input), istream_iterator<string>(),
  189. back_inserter<vector <string> >(tokens));
  190. input.close();
  191. // trim potential \r's
  192. transform(tokens.begin(), tokens.end(), tokens.begin(), rtrim);
  193. // convert tokens to upper case
  194. for (vector<string>::iterator i = tokens.begin(); i != tokens.end(); i++) {
  195. transform((*i).begin(), (*i).end(), (*i).begin(), (int(*)(int))toupper);
  196. }
  197. // mapp to distances from tree
  198. vector<int> distances;
  199. transform(tokens.begin(), tokens.end(), back_inserter<vector <int> >(distances), tree);
  200. // sum distances
  201. cout << accumulate(distances.begin(), distances.end(), 0) << endl;
  202. }
  203. }
  204.  
  205. /*
  206. * Local Variables:
  207. * compile-command: "g++ -O3 -o breathalyzer breathalyzer.cpp"
  208. * End:
  209. */
Add Comment
Please, Sign In to add comment