Advertisement
Guest User

Untitled

a guest
Nov 21st, 2014
147
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 13.75 KB | None | 0 0
  1. #include <fstream>
  2. #include <iterator>
  3. #include <vector>
  4. #include <algorithm>
  5. #include <unordered_map>
  6. #include <iostream>
  7. #include <stdexcept>
  8. #include <bitset>
  9.  
  10. struct Node {
  11.    
  12. public:
  13.     Node(char symbol, int frequency, char part_of_code)
  14.     : symbol_(symbol),
  15.     frequency_(frequency),
  16.     code_(part_of_code),
  17.     parent_(NULL),
  18.     left_(NULL),
  19.     right_(NULL) {
  20.     }
  21.    
  22.     Node(Node *a, Node *b)
  23.     : symbol_('0'),
  24.     frequency_(a->get_frequency() + b->get_frequency()),
  25.     left_(a),
  26.     right_(b),
  27.     code_('0'),
  28.     parent_(NULL) {
  29.         a->set_parent(this);
  30.         b->set_parent(this);
  31.         a->set_code('0');
  32.         b->set_code('1');
  33.     }
  34.    
  35.     ~Node() {
  36.         delete left_;
  37.         delete right_;
  38.     }
  39.    
  40.     void set_code(char code) {
  41.         code_ = code;
  42.     }
  43.    
  44.     void set_parent(Node *parent) {
  45.         parent_ = parent;
  46.     }
  47.    
  48.     Node *get_parent() {
  49.         return parent_;
  50.     }
  51.    
  52.     Node *get_left() {
  53.         return left_;
  54.     }
  55.    
  56.     Node *get_right() {
  57.         return right_;
  58.     }
  59.    
  60.     char get_symbol() {
  61.         return symbol_;
  62.     }
  63.    
  64.     int get_frequency() {
  65.         return frequency_;
  66.     }
  67.    
  68.     char get_code() {
  69.         return code_;
  70.     }
  71.    
  72.    
  73. private:
  74.     char symbol_;
  75.     int frequency_;
  76.     Node *parent_;
  77.     Node *left_;
  78.     Node *right_;
  79.     char code_;
  80. };
  81.  
  82. struct HuffmanTree {
  83.    
  84. public:
  85.    
  86.     HuffmanTree(std::vector <std::pair<char, int> > *pairs) {
  87.         root = NULL;
  88.         leafs = new std::vector <Node*>();
  89.         if (pairs->size() == 0) {return;}
  90.         else if (pairs->size() == 1) {
  91.             Node *node = new Node((*pairs)[0].first,(*pairs)[0].second, '0');
  92.             leafs->push_back(node);
  93.             root = node;
  94.         }
  95.         else {
  96.             int root_frequency = std::numeric_limits<int>::max();
  97.             int pointer = 0;
  98.            
  99.             while (pointer + 1 < pairs->size()) {
  100.                
  101.                 int first_frequency = (*pairs)[pointer].second;
  102.                 int second_frequency = (*pairs)[pointer + 1].second;
  103.                 if (second_frequency < root_frequency) {
  104.                     connect_two_nodes((*pairs)[pointer], (*pairs)[pointer + 1]);
  105.                     pointer += 2;
  106.                 } else {
  107.                     if (first_frequency < root_frequency) {connect_one_node((*pairs)[pointer], true);}
  108.                     else {connect_one_node((*pairs)[pointer], false);}
  109.                     pointer++;
  110.                 }
  111.                 root_frequency = root->get_frequency();
  112.             }
  113.        
  114.             if (pointer + 1 == pairs->size()) {
  115.                 if ((*pairs)[pointer].second < root_frequency) {connect_one_node((*pairs)[pointer], true);}
  116.                 else {connect_one_node((*pairs)[pointer], false);}
  117.             }
  118.         }
  119.     }
  120.    
  121.     ~HuffmanTree() {
  122.         delete root;
  123.         delete leafs;
  124.     }
  125.    
  126.    
  127.     std::unordered_map<char, std::string> *get_code_table() {
  128.         std::unordered_map<char, std::string> *code_table = new std::unordered_map<char, std::string>();
  129.         for (size_t i = 0; i != (unsigned int) leafs->size(); ++i) {
  130.             (*code_table)[(*leafs)[i]->get_symbol()] = get_code(*(*leafs)[i]);
  131.         }
  132.         return code_table;
  133.     }
  134.    
  135.     char get_symbol(char* buffer, size_t &buffer_pointer){
  136.        
  137.         size_t old_pointer = buffer_pointer;
  138.         Node *tree_pointer = root;
  139.         while(tree_pointer->get_left() != NULL || tree_pointer->get_right() != NULL){
  140.             if(buffer[buffer_pointer] == '0'){
  141.                 tree_pointer = tree_pointer->get_left();
  142.             }else if(buffer[buffer_pointer] == '1'){
  143.                 tree_pointer = tree_pointer->get_right();
  144.             }
  145.             buffer_pointer++;
  146.         }
  147.         for(int i = old_pointer; i < buffer_pointer; ++i){
  148.             std::cout << buffer[i];
  149.         }
  150.         std::cout << std::endl;
  151.         std::cout << "returning symbol is : " << tree_pointer->get_symbol()<< std::endl;
  152.         return tree_pointer->get_symbol();
  153.     }
  154.    
  155. private:
  156.  
  157.     Node *root;
  158.     std::vector<Node*> *leafs;
  159.    
  160.     std::string get_code(Node &node) {
  161.         Node *pointer = &node;
  162.         std::string code;
  163.         while (pointer != root) {
  164.             code += pointer->get_code();
  165.             pointer = pointer->get_parent();
  166.         }
  167.         std::reverse(code.rbegin(), code.rend());
  168.         return code;
  169.     }
  170.    
  171.     void connect_two_nodes(std::pair<char, int>  pair1, std::pair<char, int>  pair2) {
  172.        
  173.         Node *left_node = new Node(pair1.first, pair1.second, '1');
  174.         Node *right_node = new Node(pair2.first, pair2.second, '0');
  175.         Node *left_and_right_merged = new Node(left_node, right_node);
  176.         leafs->push_back(left_node);
  177.         leafs->push_back(right_node);
  178.        
  179.         if (root == NULL) {
  180.             root = left_and_right_merged;
  181.         } else {
  182.             Node *old_root = root;
  183.             old_root->set_code('1');
  184.             left_and_right_merged->set_code('0');
  185.             Node *new_root = new Node(old_root, left_and_right_merged);
  186.             root = new_root;
  187.         }
  188.     }
  189.    
  190.     void connect_one_node(std::pair<char, int>  pair, bool connect_as_right) {
  191.         if (connect_as_right) {
  192.             Node *leaf = new Node(pair.first, pair.second, '0');
  193.             root->set_code('1');
  194.             leafs->push_back(leaf);
  195.             Node *new_root = new Node(leaf, root);
  196.             root = new_root;
  197.         } else {
  198.             Node *leaf = new Node(pair.first, pair.second, '1');
  199.             leafs->push_back(leaf);
  200.             root->set_code('0');
  201.             Node *new_root = new Node(root, leaf);
  202.             root = new_root;
  203.         }
  204.     }
  205. };
  206.  
  207. struct frequency_pair_comparator {
  208.     inline bool operator()(const std::pair<char, int> &a, const std::pair<char, int> &b) {
  209.         return (a.second < b.second);
  210.     }
  211. };
  212.  
  213. std::vector<char> *get_symbols(char* input_name) {
  214.    
  215.     std::ifstream input (input_name, std::ifstream::binary);
  216.    
  217.     if(!input){
  218.         return new std::vector<char>();
  219.     }
  220.    
  221.     input.seekg (0, input.end);
  222.     size_t length = input.tellg();
  223.     input.seekg (0, input.beg);
  224.     std::cout << length << std::endl;
  225.    
  226.    
  227.     char *buffer = new char [length];
  228.     input.read(buffer, length);
  229.     std::vector<char> *symbols = new std::vector<char>();
  230.     for(size_t i = 0; i < length; i++){
  231.         symbols->push_back(buffer[i]);
  232.     }
  233.     input.close();
  234.     delete buffer;
  235.     return symbols;
  236. }
  237.  
  238.  
  239. std::vector<std::pair<char, int> > *get_frequencies(std::vector<char>  *symbols) {
  240.    
  241.     std::unordered_map<char, int> frequencies;
  242.     for (size_t i = 0; i != symbols->size(); ++i) {
  243.         if (frequencies[(*symbols)[i]] == 0) {
  244.             frequencies[(*symbols)[i]] = 1;
  245.         } else {
  246.             frequencies[(*symbols)[i]] += 1;
  247.         }
  248.     }
  249.     std::vector<std::pair<char, int> > *sortable_frequencies = new std::vector<std::pair<char, int> >();
  250.     for(std::unordered_map<char, int>::iterator it = frequencies.begin(); it != frequencies.end(); ++it){
  251.         sortable_frequencies->push_back( std::pair<char, int>(it->first, it->second));
  252.     }
  253.     std::sort(sortable_frequencies->begin(), sortable_frequencies->end(), frequency_pair_comparator());
  254.     return sortable_frequencies;
  255. }
  256.  
  257.  
  258.  
  259. void write_coded_data(char* output_name, std::vector<std::pair<char, int> > *frequences, std::unordered_map<char, std::string> *code_table, std::vector<char> *symbols){
  260.    
  261.     long long tree_size = 0;
  262.     std::ofstream outfile (output_name, std::ofstream::binary);
  263.    
  264.     outfile << "  " <<frequences->size() << std::endl;
  265.     tree_size += sizeof(frequences->size());
  266.     for(std::vector<std::pair<char, int> >::iterator it = frequences->begin(); it != frequences->end(); ++it){
  267.         outfile << it->first << " " << it->second << std::endl;
  268.         tree_size += (sizeof(it->second) + sizeof(it->first) + sizeof(" ") + 1);
  269.     }
  270.     std::cout << tree_size << std::endl;
  271.    
  272.     const size_t buffer_size = 8;
  273.     char buffer[buffer_size + 1];
  274.     size_t buffer_pointer = 0;
  275.     size_t code_pointer = 0;
  276.     size_t code_size = 0;
  277.     std::string code;
  278.     size_t symbols_pointer = 0;
  279.     long long new_symbols_counter = 0;
  280.    
  281.     while(symbols_pointer != (unsigned int) symbols->size() || (symbols_pointer == (unsigned int) symbols->size() && code_size - code_pointer >= 8)){
  282.         if(code_size == 0){
  283.             code = (*code_table)[(*symbols)[symbols_pointer]];
  284.             code_size = code.length();
  285.             symbols_pointer++;
  286.         }
  287.         while(buffer_pointer < buffer_size && code_pointer < code_size){
  288.             buffer[buffer_pointer] = code.at(code_pointer);
  289.             ++code_pointer;
  290.             ++buffer_pointer;
  291.         }
  292.         if(buffer_pointer == buffer_size){
  293.             buffer_pointer = 0;
  294.             unsigned long i = 0;
  295.             try {
  296.                 std::bitset<buffer_size> bit_buffer ((std::string(buffer)));
  297.                 i = bit_buffer.to_ulong();
  298.             }
  299.             catch (const std::invalid_argument& ia) {}
  300.             char char_buffer = static_cast<unsigned char>( i );
  301.             outfile.write(&char_buffer, sizeof(char_buffer));
  302.             new_symbols_counter += sizeof(char);
  303.         }
  304.  
  305.         if(code_pointer == code_size){
  306.             code_size = 0;
  307.             code_pointer = 0;
  308.         }
  309.     }
  310.    
  311.     if(buffer_pointer != buffer_size){
  312.         unsigned long i = 0;
  313.         try {
  314.             std::bitset<buffer_size> bit_buffer ((std::string(buffer)));
  315.             i = bit_buffer.to_ulong();
  316.         }
  317.         catch (const std::invalid_argument& ia) {}
  318.         char char_buffer = static_cast<unsigned char>( i );
  319.         outfile.write(&char_buffer, sizeof(char_buffer));
  320.         outfile.seekp(std::ios_base::beg);
  321.         outfile << buffer_pointer << std::endl;
  322.     }else{
  323.         outfile.seekp(std::ios_base::beg);
  324.         outfile << 0 << std::endl;
  325.        
  326.     }
  327.     std::cout << new_symbols_counter << std::endl;
  328.     outfile.close();
  329. }
  330.  
  331.  
  332.  
  333. void compress_file(char* input_name, char* output_name) {
  334.    
  335.     std::vector<char> *symbols = get_symbols(input_name);
  336.     std::vector<std::pair<char, int> > *frequencies_table = get_frequencies(symbols);
  337.    
  338.     HuffmanTree tree(frequencies_table);
  339.     std::unordered_map<char, std::string> *code_table = tree.get_code_table();
  340.     write_coded_data(output_name, frequencies_table, code_table, symbols);
  341.    
  342.     for(std::unordered_map<char, std::string>::iterator it = code_table->begin(); it != code_table->end(); ++it){
  343.         std::cout << it->first << " " << it->second << std::endl;
  344.     }
  345.     delete symbols;
  346.     delete frequencies_table;
  347.     delete code_table;
  348. }
  349.  
  350. std::vector<std::pair<char, int> > *get_frequences_from_file(std::ifstream &input){
  351.     int number_of_symbols = 0;
  352.     input >> number_of_symbols;
  353.     std:: vector<std::pair<char, int> > *frequences = new std::vector<std::pair<char, int> >();
  354.     for(int i = 0; i != number_of_symbols; ++i){
  355.         char symbol = 0;
  356.         input >> symbol;
  357.         int frequency = 0;
  358.         input >> frequency;
  359.         frequences->push_back(std::pair<char, int>(symbol, frequency));
  360.     }
  361.     return frequences;
  362. }
  363.  
  364. std::vector<unsigned char> *get_data(std::ifstream &input){
  365.     std::vector<unsigned char> *symbols = new std::vector<unsigned char>();
  366.     for(std::istreambuf_iterator<char> it(input), e; it != e; ++it){
  367.         symbols->push_back(*it);
  368.     }
  369.     symbols->erase(symbols->begin(), symbols->begin() + 1);
  370.     std::cout << symbols->size() <<  std::endl;
  371.     return symbols;
  372. }
  373.  
  374. void make_source_file(char* output_name ,std::vector<std::pair<char, int> > *frequences, char* bits_buffer, size_t bits_buffer_size){
  375.    
  376.     HuffmanTree tree(frequences);
  377.     size_t pointer = 0;
  378.     std::ofstream outfile (output_name, std::ofstream::binary);
  379.    
  380.     while(pointer < bits_buffer_size){
  381.         char symbol = tree.get_symbol(bits_buffer, pointer);
  382.         outfile.write(&symbol, sizeof(symbol));
  383.     }
  384.     outfile.close();
  385. }
  386. void get_source_file(char* input_name, char* output_name){
  387.    
  388.    
  389.     std::ifstream input (input_name, std::ifstream::binary);
  390.     if(!input){
  391.         return;
  392.     }
  393.     int last_symbol_significant_bits = 0;
  394.     input >> last_symbol_significant_bits;
  395.     std::vector<std::pair<char, int> > *frequences = get_frequences_from_file(input);
  396.    
  397.     std::vector<unsigned char> *symbols = get_data(input);
  398.     input.close();
  399.     std::cout << symbols->size() << std::endl;
  400.  
  401.     const size_t bits_buffer_size = 8 * (symbols->size() - 1) + last_symbol_significant_bits;
  402.     char *bits_buffer = new char[bits_buffer_size];
  403.     size_t buffer_pointer = 0;
  404.     for(size_t i = 0; i < symbols->size() - 1; ++i){
  405.         std::string bit_string = std::bitset<8>((*symbols)[i]).to_string();
  406.         for(size_t k = 0; k < 8; k++){
  407.             bits_buffer[buffer_pointer] = bit_string.at(k);
  408.             buffer_pointer++;
  409.         }
  410.     }
  411.     std::string bit_string = std::bitset<8>((*symbols)[symbols->size() - 1]).to_string();
  412.     for(size_t i = 0; i < last_symbol_significant_bits; ++i){
  413.         bits_buffer[buffer_pointer] = bit_string.at(i);
  414.         buffer_pointer++;
  415.     }
  416.     //PREVIOUS CODE WORKS PERFECT
  417.     /**
  418.      остались ошибки с разбором слова
  419.      нужно фиксить сам алгоритм кодирования и алгоритм разбора
  420.      **/
  421.     make_source_file(output_name, frequences, bits_buffer, bits_buffer_size);
  422.    
  423.     delete frequences;
  424.     delete symbols;
  425.    
  426. }
  427.  
  428. void decompress_file(char* input_name, char* output_name) {
  429.     get_source_file(input_name, output_name);
  430. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement