Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <fstream>
- #include <algorithm>
- #include <map>
- #include <vector>
- #include <iterator>
- #include <numeric>
- #include <string.h>
- using namespace std;
- class BKNode;
- /**
- * Implementation of levenshtein distance that avoids any heap allocations and
- * only needs O(2 * min(length(one), length(two))) space.
- */
- int levenshtein(const string& one, const string& two) {
- if (one.length() > two.length()) {
- return levenshtein(two, one);
- }
- int length1 = one.length() + 1;
- int length2 = two.length() + 1;
- int data1[length1];
- int data2[length1];
- int* current = data1;
- int* previous = data2;
- memset(data1, 0, length1 * sizeof(int));
- for (int i = 0; i < length1; i++) {
- data2[i] = i;
- }
- for (int i = 1; i < length2; i++) {
- if (i > 1) {
- memset(previous, 0, length1 * sizeof(int));
- int* tmp = previous;
- previous = current;
- current = tmp;
- }
- current[0] = i;
- for (int j = 1; j < length1; j++) {
- current[j] = min(previous[j] + 1, min(current[j-1] + 1, previous[j-1] +
- (one[j-1] != two[i-1] ? 1 : 0)));
- }
- }
- return current[length1-1];
- }
- /**
- * Node structure of the BK-tree.
- */
- class BKNode {
- private:
- string value;
- map<int, BKNode> children;
- static int count;
- public:
- BKNode(const string& value)
- : value(value) {
- }
- BKNode() {
- }
- /**
- * Inserts a value into the tree.
- */
- void insert(const string& value) {
- int distance = levenshtein(this->value, value);
- if (distance == 0) {
- return;
- }
- map<int,BKNode>::const_iterator i(children.find(distance));
- if (i != children.end()) {
- // use [] operator to avoid copy constructor
- children[distance].insert(value);
- } else {
- // only copy constructor call of BKNode
- children[distance] = BKNode(value);
- }
- }
- /**
- * Computes minimum distance between value and this node or its children.
- */
- int minimumTreeDistance(const string& value, int currentMin) {
- ++count; // count invocations
- int distance = levenshtein(this->value, value);
- if (distance == 0 || children.empty()) {
- return distance;
- }
- currentMin = min(distance, currentMin);
- map<int,BKNode>::const_iterator i(children.find(distance));
- if (i != children.end()) {
- currentMin = min(children[distance].minimumTreeDistance(value, currentMin),
- currentMin);
- }
- if (currentMin <= 1) {
- return currentMin;
- }
- for (int i = distance - currentMin + 1; i < distance + currentMin; ++i) {
- if (i != distance && children.find(i) != children.end()) {
- currentMin = min(children[i].minimumTreeDistance(value, currentMin),
- currentMin);
- if (currentMin <= 1) {
- return currentMin;
- }
- }
- }
- return currentMin;
- }
- };
- int BKNode::count = 0;
- /**
- * Implementation of a Burkhard Keller tree for efficient retrieval
- * of string values based on the Levenshtein distance as metric.
- *
- * Most logic is contained in BKNode class.
- */
- class BKTree : public unary_function<int, const string&> {
- private:
- BKNode root;
- int count;
- public:
- BKTree(const string& rootValue)
- : root(BKNode(rootValue)) {
- }
- void insert(const string& value) {
- root.insert(value);
- ++count;
- }
- /**
- * Computes minimal distance between value and any value in the BK-tree.
- */
- int operator() (const string& value) {
- int distance = root.minimumTreeDistance(value, value.length());
- #ifdef VERBOSE
- cout << value << " " << BKNode::count << endl;
- BKNode::count = 0;
- #endif
- return distance;
- }
- int size() const {
- return count;
- }
- };
- /**
- * Trims white space on the right hand side of string. Manipulates
- * string argument.
- */
- string& rtrim(string& value) {
- size_t last = value.find_last_not_of(" \n\r\t");
- if (last != string::npos) {
- value.erase(last+1);
- }
- return value;
- }
- int main(int argc, char** argv) {
- //ifstream file("/tmp/twl06.txt");
- ifstream file("/var/tmp/twl06.txt");
- // build tree from correct words
- BKTree tree("");
- if (file.is_open() && !file.eof()) {
- string line;
- getline(file, line);
- tree = BKTree(rtrim(line));
- while (!file.eof()) {
- getline(file, line);
- tree.insert(rtrim(line));
- }
- file.close();
- }
- ifstream input(argv[1]);
- if (input.is_open()) {
- vector<string> tokens;
- // read words from input file
- copy(istream_iterator<string>(input), istream_iterator<string>(),
- back_inserter<vector <string> >(tokens));
- input.close();
- // trim potential \r's
- transform(tokens.begin(), tokens.end(), tokens.begin(), rtrim);
- // convert tokens to upper case
- for (vector<string>::iterator i = tokens.begin(); i != tokens.end(); i++) {
- transform((*i).begin(), (*i).end(), (*i).begin(), (int(*)(int))toupper);
- }
- // mapp to distances from tree
- vector<int> distances;
- transform(tokens.begin(), tokens.end(), back_inserter<vector <int> >(distances), tree);
- // sum distances
- cout << accumulate(distances.begin(), distances.end(), 0) << endl;
- }
- }
- /*
- * Local Variables:
- * compile-command: "g++ -O3 -o breathalyzer breathalyzer.cpp"
- * End:
- */
Add Comment
Please, Sign In to add comment