Advertisement
Guest User

Huffman word encoder

a guest
Jul 18th, 2014
305
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.69 KB | None | 0 0
  1. #include <map>
  2. #include <queue>
  3. #include <cctype>
  4. #include <fstream>
  5. #include <iostream>
  6. #include <getopt.h>
  7.  
  8. struct Node {
  9.   std::string word;
  10.   int weight;
  11.   Node *left, *right;
  12.  
  13.   bool operator<(const Node &node) const {
  14.     return weight == node.weight ? word < node.word : weight > node.weight;
  15.   }
  16.  
  17.   bool is_internal() { return word.empty(); }
  18. };
  19.  
  20. struct Huffman {
  21.   /* Initializes from sample text from a stream. */
  22.   Huffman(const char *file) {
  23.     std::ifstream in{file};
  24.     std::map<std::string, int> words;
  25.     std::string word;
  26.     /* Parse each individual word. */
  27.     while (!in.eof()) {
  28.       int c = in.get();
  29.       if (std::isalnum(c)) {
  30.         word.push_back(std::toupper(c));
  31.       } else if (word.size() > 0) {
  32.         words[word]++;
  33.         word.clear();
  34.       }
  35.     }
  36.     /* Build Huffman tree. */
  37.     std::priority_queue<Node> queue;
  38.     for (auto &kv : words) {
  39.       queue.push(Node{kv.first, kv.second, nullptr, nullptr});
  40.     }
  41.     while (queue.size() > 1) {
  42.       Node *left = new Node(queue.top());
  43.       queue.pop();
  44.       Node *right = new Node(queue.top());
  45.       queue.pop();
  46.       queue.push(Node{"", left->weight + right->weight, left, right});
  47.     }
  48.     root = new Node(queue.top());
  49.     std::vector<bool> empty;
  50.     fill(root, empty);
  51.   }
  52.  
  53.   /* Fill out the lookup table. */
  54.   void fill(Node *root, std::vector<bool> &bits) {
  55.     if (root->is_internal()) {
  56.       bits.push_back(false);
  57.       fill(root->left, bits);
  58.       bits.pop_back();
  59.       bits.push_back(true);
  60.       fill(root->right, bits);
  61.       bits.pop_back();
  62.     } else {
  63.       lookup[root->word] = bits;
  64.     }
  65.   }
  66.  
  67.   /* Add word to output. */
  68.   void add(std::string word) {
  69.     auto &encoding = lookup[word];
  70.     if (encoding.empty()) {
  71.       std::cerr << "Unknown word: " << word << std::endl;
  72.       exit(1);
  73.     } else {
  74.       for (const auto &b : encoding) bits.push_back(b);
  75.     }
  76.   }
  77.  
  78.   void decompress(std::istream &in, std::ostream &out) {
  79.     uint8_t c = in.get();
  80.     int tail = c >> 5 == 0 ? 8 : c >> 5;
  81.     c <<= 3;
  82.     int count = 5;
  83.     Node *node = root;
  84.     while (true) {
  85.       while (count > 0) {
  86.         if (node->is_internal()) {
  87.           node = c & 0x80 ? node->right : node->left;
  88.           c <<= 1;
  89.           count--;
  90.         } else {
  91.           out << node->word << " ";
  92.           node = root;
  93.         }
  94.       }
  95.       if (in.eof()) break;
  96.       c = in.get();
  97.       in.peek();  // peek for EOF
  98.       count = in.eof() ? tail : 8;
  99.     }
  100.     if (!node->is_internal()) out << node->word << " ";
  101.   }
  102.  
  103.   std::map<std::string, std::vector<bool>> lookup;
  104.   std::vector<bool> bits;
  105.   Node *root;
  106. };
  107.  
  108. /* Write encoding to output. */
  109. std::ostream &operator<<(std::ostream &out, const Huffman &encoder) {
  110.   uint8_t fill = (encoder.bits.size() + 3) % 8;
  111.   int count = 3;
  112.   for (const auto &b : encoder.bits) {
  113.     if (count == 8) {
  114.       out.write((char *)&fill, 1);
  115.       count = 0;
  116.     }
  117.     fill <<= 1;
  118.     fill |= b;
  119.     count++;
  120.   }
  121.   fill <<= 8 - count;
  122.   out.write((char *)&fill, 1);
  123.   return out;
  124. }
  125.  
  126. int main(int argc, char **argv) {
  127.   const char *dict = "/dev/null";
  128.   bool compress = true;
  129.   int c;
  130.   while ((c = getopt(argc, argv, "cdf:")) != -1) {
  131.     switch (c) {
  132.       case 'c':
  133.         compress = true;
  134.         break;
  135.       case 'd':
  136.         compress = false;
  137.         break;
  138.       case 'f':
  139.         dict = optarg;
  140.         break;
  141.     }
  142.   }
  143.  
  144.   Huffman encoder{dict};
  145.   if (compress) {
  146.     std::string word;
  147.     while (std::cin >> word) {
  148.       encoder.add(word);
  149.     }
  150.     std::cout << encoder;
  151.   } else {
  152.     encoder.decompress(std::cin, std::cout);
  153.     std::cout << std::endl;
  154.   }
  155.   return 0;
  156. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement