Advertisement
Guest User

LZW v.6

a guest
Mar 14th, 2013
46
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 19.30 KB | None | 0 0
  1. ///
  2. /// @file
  3. /// @author Julius Pettersson
  4. /// @copyright MIT/Expat License.
  5. /// @brief LZW file compressor
  6. /// @version 6
  7. /// @remarks This version borrows heavily from Juha Nieminen's work.
  8. ///
  9. /// This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line compressor.
  10. /// It was written with Doxygen comments.
  11. ///
  12. /// @see http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  13. /// @see http://marknelson.us/2011/11/08/lzw-revisited/
  14. /// @see http://www.cs.duke.edu/csed/curious/compression/lzw.html
  15. /// @see http://warp.povusers.org/EfficientLZW/index.html
  16. /// @see http://en.cppreference.com/
  17. /// @see http://www.doxygen.org/
  18. ///
  19. /// @remarks DF: the data file
  20. /// @remarks EF: the encoded file
  21. ///
  22.  
  23. #include <algorithm>
  24. #include <array>
  25. #include <climits>
  26. #include <cstddef>
  27. #include <cstdint>
  28. #include <cstdlib>
  29. #include <exception>
  30. #include <fstream>
  31. #include <ios>
  32. #include <iostream>
  33. #include <istream>
  34. #include <limits>
  35. #include <memory>
  36. #include <ostream>
  37. #include <stack>
  38. #include <stdexcept>
  39. #include <string>
  40. #include <utility>
  41. #include <vector>
  42.  
  43. /// Safety macro; if not defined, some overkillish safety checks are avoided.
  44. //#define TAKE_NO_RISKS
  45.  
  46. /// Type used to store and retrieve codes.
  47. using CodeType = std::uint32_t;
  48.  
  49. namespace globals {
  50.  
  51. /// Dictionary Maximum Size (when reached, the dictionary will be reset)
  52. //const CodeType dms {16 * 1024 * 1024};
  53. const CodeType dms {1024}; // small dictionary size for debugging purposes
  54.  
  55.  
  56. /// Ready-made bit masks for use in `CodeWriter::write()` and `CodeReader::read()`.
  57. const std::array<unsigned char, 9> masks {{0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF}};
  58.  
  59. } // namespace globals
  60.  
  61. ///
  62. /// @brief Special codes used by the encoder to control the decoder.
  63. /// @todo Metacodes should not be hardcoded to match their index.
  64. ///
  65. enum class MetaCode: CodeType {
  66.     Eof = 1u << CHAR_BIT,   ///< End-of-file.
  67.     Reset                   ///< Reset dictionary and code width.
  68. };
  69.  
  70. ///
  71. /// @brief Encoder's custom dictionary type.
  72. ///
  73. class EncoderDictionary {
  74.  
  75.     ///
  76.     /// @brief Binary search tree node.
  77.     ///
  78.     struct Node {
  79.  
  80.         ///
  81.         /// @brief Default constructor.
  82.         /// @param c    byte that the Node will contain
  83.         ///
  84.         explicit Node(char c): first(globals::dms), c(c), left(globals::dms), right(globals::dms)
  85.         {
  86.         }
  87.  
  88.         CodeType    first;  ///< Code of first child string.
  89.         char        c;      ///< Byte.
  90.         CodeType    left;   ///< Code of child node with byte < `c`.
  91.         CodeType    right;  ///< Code of child node with byte > `c`.
  92.     };
  93.  
  94. public:
  95.  
  96.     ///
  97.     /// @brief Default constructor.
  98.     /// @details It builds the `initials` cheat sheet.
  99.     ///
  100.     EncoderDictionary()
  101.     {
  102.         const long int minc = std::numeric_limits<char>::min();
  103.         const long int maxc = std::numeric_limits<char>::max();
  104.         CodeType k {0};
  105.  
  106.         for (long int c = minc; c <= maxc; ++c)
  107.             initials[static_cast<unsigned char> (c)] = k++;
  108.  
  109.         vn.reserve(globals::dms);
  110.         reset();
  111.     }
  112.  
  113.     ///
  114.     /// @brief Resets dictionary to its initial contents.
  115.     /// @note Adds dummy nodes to account for the metacodes.
  116.     ///
  117.     void reset()
  118.     {
  119.         vn.clear();
  120.  
  121.         const long int minc = std::numeric_limits<char>::min();
  122.         const long int maxc = std::numeric_limits<char>::max();
  123.  
  124.         for (long int c = minc; c <= maxc; ++c)
  125.             vn.push_back(Node(c));
  126.  
  127.         // add dummy nodes for the metacodes
  128.         vn.push_back(Node('\x00')); // MetaCode::Eof
  129.         vn.push_back(Node('\x00')); // MetaCode::Reset
  130.     }
  131.  
  132.     ///
  133.     /// @brief Searches for a pair (`i`, `c`) and inserts the pair if it wasn't found.
  134.     /// @param i                code to search for
  135.     /// @param c                attached byte to search for
  136.     /// @return The index of the pair, if it was found.
  137.     /// @retval globals::dms    if the pair wasn't found
  138.     ///
  139.     CodeType search_and_insert(CodeType i, char c)
  140.     {
  141.         if (i == globals::dms)
  142.             return search_initials(c);
  143.  
  144.         const CodeType vn_size = vn.size();
  145.         CodeType ci {vn[i].first}; // Current Index
  146.  
  147.         if (ci != globals::dms)
  148.         {
  149.             while (true)
  150.                 if (c < vn[ci].c)
  151.                 {
  152.                     if (vn[ci].left == globals::dms)
  153.                     {
  154.                         vn[ci].left = vn_size;
  155.                         break;
  156.                     }
  157.                     else
  158.                         ci = vn[ci].left;
  159.                 }
  160.                 else
  161.                 if (c > vn[ci].c)
  162.                 {
  163.                     if (vn[ci].right == globals::dms)
  164.                     {
  165.                         vn[ci].right = vn_size;
  166.                         break;
  167.                     }
  168.                     else
  169.                         ci = vn[ci].right;
  170.                 }
  171.                 else // c == vn[ci].c
  172.                     return ci;
  173.         }
  174.         else
  175.             vn[i].first = vn_size;
  176.  
  177.         vn.push_back(Node(c));
  178.         return globals::dms;
  179.     }
  180.  
  181.     ///
  182.     /// @brief Fakes a search for byte `c` in the one-byte area of the dictionary.
  183.     /// @param c    byte to search for
  184.     /// @return The code associated to the searched byte.
  185.     ///
  186.     CodeType search_initials(char c) const
  187.     {
  188.         return initials[static_cast<unsigned char> (c)];
  189.     }
  190.  
  191.     ///
  192.     /// @brief Returns the number of dictionary entries.
  193.     ///
  194.     std::vector<Node>::size_type size() const
  195.     {
  196.         return vn.size();
  197.     }
  198.  
  199. private:
  200.  
  201.     /// Vector of nodes on top of which the binary search tree is implemented.
  202.     std::vector<Node> vn;
  203.  
  204.     /// Cheat sheet for mapping one-byte strings to their codes.
  205.     std::array<CodeType, 1u << CHAR_BIT> initials;
  206. };
  207.  
  208. ///
  209. /// @brief Helper structure for use in `CodeWriter` and `CodeReader`.
  210. ///
  211. struct ByteCache {
  212.  
  213.     ///
  214.     /// @brief Default constructor.
  215.     ///
  216.     ByteCache(): data(0x00), used(0)
  217.     {
  218.     }
  219.  
  220.     unsigned char   data;   ///< The bits of the cached byte.
  221.     std::size_t     used;   ///< Bits currently in use.
  222. };
  223.  
  224. ///
  225. /// @brief Variable binary width code writer.
  226. ///
  227. class CodeWriter {
  228. public:
  229.  
  230.     ///
  231.     /// @brief Default constructor.
  232.     /// @param [out] os     Output Stream to write codes to
  233.     ///
  234.     explicit CodeWriter(std::ostream &os): os(os), bits(CHAR_BIT + 1)
  235.     {
  236.     }
  237.  
  238.     ///
  239.     /// @brief Destructor.
  240.     /// @note Writes `MetaCode::Eof` and flushes the final byte to the stream by padding it with zeros.
  241.     ///
  242.     ~CodeWriter()
  243.     {
  244.         write(static_cast<CodeType> (MetaCode::Eof));
  245.  
  246.         if (lo.used != 0)
  247.         {
  248.             bits = CHAR_BIT;
  249.             write(0);
  250.         }
  251.     }
  252.  
  253.     ///
  254.     /// @brief Getter for `CodeWriter::bits`.
  255.     ///
  256.     std::size_t get_bits() const
  257.     {
  258.         return bits;
  259.     }
  260.  
  261.     ///
  262.     /// @brief Resets internal binary width.
  263.     /// @note Default value is `CHAR_BIT + 1`.
  264.     ///
  265.     void reset_bits()
  266.     {
  267.         bits = CHAR_BIT + 1;
  268.     }
  269.  
  270.     ///
  271.     /// @brief Increases internal binary width by one.
  272.     /// @throws std::overflow_error     internal binary width cannot be increased
  273.     /// @remarks The exception should never be thrown, under normal circumstances.
  274.     ///
  275.     void increase_bits()
  276.     {
  277. #ifdef TAKE_NO_RISKS
  278.         if (bits == SIZE_MAX)
  279.             throw std::overflow_error("CodeWriter::increase_bits()");
  280. #endif
  281. std::clog << "bits: " << bits << " -> " << bits + 1 << '\n';
  282.         ++bits;
  283.     }
  284.  
  285.     ///
  286.     /// @brief Writes the code `k` with a binary width of `CodeWriter::bits`.
  287.     /// @param k            code to be written
  288.     /// @return Whether or not the stream can be used for output.
  289.     /// @retval true        the output stream can still be used
  290.     /// @retval false       the output stream can no longer be used
  291.     ///
  292.     bool write(CodeType k)
  293.     {
  294.         std::size_t remaining_bits {bits};
  295.  
  296.         // to hold the incomplete last byte of k
  297.         ByteCache k_last;
  298.  
  299.         // save the tail of k, then remove it from k
  300.         k_last.used = (remaining_bits - (CHAR_BIT - lo.used)) % CHAR_BIT;
  301.         k_last.data = (k & globals::masks[k_last.used]) << (CHAR_BIT - k_last.used);
  302.         k >>= k_last.used;
  303.         remaining_bits -= k_last.used;
  304.  
  305.         // to hold k's full bytes
  306.         static std::stack<char, std::vector<char>> k_bytes;
  307.  
  308.         // to hold k's first byte
  309.         unsigned char k_first {0x00};
  310.  
  311.         while (remaining_bits != 0)
  312.             if (remaining_bits >= CHAR_BIT)
  313.             {
  314.                 k_bytes.push(k);
  315.                 k >>= CHAR_BIT;
  316.                 remaining_bits -= CHAR_BIT;
  317.             }
  318.             else
  319.             {
  320.                 k_first = k;
  321.                 remaining_bits = 0;
  322.             }
  323.  
  324.         // k_first only has content if lo.data has too
  325.         if (lo.used != 0)
  326.             os.put(static_cast<char> (lo.data | k_first));
  327.  
  328.         while (!k_bytes.empty())
  329.         {
  330.             os.put(k_bytes.top());
  331.             k_bytes.pop();
  332.         }
  333.  
  334.         lo = k_last;
  335.         return os;
  336.     }
  337.  
  338. private:
  339.  
  340.     std::ostream    &os;    ///< Output Stream.
  341.     std::size_t     bits;   ///< Binary width of codes.
  342.     ByteCache       lo;     ///< LeftOvers.
  343. };
  344.  
  345. ///
  346. /// @brief Variable binary width code reader.
  347. ///
  348. class CodeReader {
  349. public:
  350.  
  351.     ///
  352.     /// @brief Default constructor.
  353.     /// @param [in] is      Input Stream to read codes from
  354.     ///
  355.     explicit CodeReader(std::istream &is): is(is), bits(CHAR_BIT + 1), feofmc(false)
  356.     {
  357.     }
  358.  
  359.     ///
  360.     /// @brief Getter for `CodeReader::bits`.
  361.     ///
  362.     std::size_t get_bits() const
  363.     {
  364.         return bits;
  365.     }
  366.  
  367.     ///
  368.     /// @brief Resets internal binary width.
  369.     /// @note Default value is `CHAR_BIT + 1`.
  370.     ///
  371.     void reset_bits()
  372.     {
  373.         bits = CHAR_BIT + 1;
  374.     }
  375.  
  376.     ///
  377.     /// @brief Increases internal binary width by one.
  378.     /// @throws std::overflow_error     if internal binary width cannot be increased
  379.     /// @remarks The exception should never be thrown, under normal circumstances.
  380.     ///
  381.     void increase_bits()
  382.     {
  383. #ifdef TAKE_NO_RISKS
  384.         if (bits == SIZE_MAX)
  385.             throw std::overflow_error("CodeReader::increase_bits()");
  386. #endif
  387. std::clog << "bits: " << bits << " -> " << bits + 1 << '\n';
  388.         ++bits;
  389.     }
  390.  
  391.     ///
  392.     /// @brief Reads the code `k` with a binary width of `CodeReader::bits`.
  393.     /// @param [out] k      code to be read
  394.     /// @return Whether or not the stream can be used for input.
  395.     /// @retval true        the input stream can still be used
  396.     /// @retval false       the input stream can no longer be used
  397.     ///
  398.     bool read(CodeType &k)
  399.     {
  400.         std::size_t remaining_bits {bits};
  401.         unsigned char temp;
  402.  
  403.         k = lo.data;
  404.         remaining_bits -= lo.used;
  405.         lo.used = 0;
  406.         lo.data = 0x00;
  407.  
  408.         while (remaining_bits != 0 && is.get(reinterpret_cast<char &> (temp)))
  409.             if (remaining_bits >= CHAR_BIT)
  410.             {
  411.                 k <<= CHAR_BIT;
  412.                 k |= temp;
  413.                 remaining_bits -= CHAR_BIT;
  414.             }
  415.             else
  416.             {
  417.                 k <<= remaining_bits;
  418.                 k |= temp >> (CHAR_BIT - remaining_bits);
  419.                 lo.used = CHAR_BIT - remaining_bits;
  420.                 lo.data = temp & globals::masks[lo.used];
  421.                 remaining_bits = 0;
  422.             }
  423.  
  424.         if (k == static_cast<CodeType> (MetaCode::Eof))
  425.         {
  426. std::clog << "FOUND EOF!\n";
  427.             feofmc = true;
  428.             return false;
  429.         }
  430.  
  431.         return is;
  432.     }
  433.  
  434.     ///
  435.     /// @brief Returns if EF is considered corrupted.
  436.     /// @retval true    didn't find end-of-file metacode
  437.     /// @retval false   found end-of-file metacode
  438.     ///
  439.     bool corrupted() const
  440.     {
  441.         return !feofmc;
  442.     }
  443.  
  444. private:
  445.  
  446.     std::istream    &is;    ///< Input Stream.
  447.     std::size_t     bits;   ///< Binary width of codes.
  448.     bool            feofmc; ///< Found End-Of-File MetaCode.
  449.     ByteCache       lo;     ///< LeftOvers.
  450. };
  451.  
  452. ///
  453. /// @brief Computes the minimum number of bits required to store the value of `n`.
  454. /// @param n    number to be evaluated
  455. /// @return Number of required bits.
  456. ///
  457. std::size_t required_bits(unsigned long int n)
  458. {
  459.     std::size_t r {1};
  460.  
  461.     while ((n >>= 1) != 0)
  462.         ++r;
  463.  
  464.     return r;
  465. }
  466.  
  467. ///
  468. /// @brief Compresses the contents of `is` and writes the result to `os`.
  469. /// @param [in] is      input stream
  470. /// @param [out] os     output stream
  471. ///
  472. void compress(std::istream &is, std::ostream &os)
  473. {
  474.     EncoderDictionary ed;
  475.     CodeWriter cw(os);
  476.     CodeType i {globals::dms}; // Index
  477.     char c;
  478.  
  479.     while (is.get(c))
  480.     {
  481.         // dictionary's maximum size was reached
  482.         if (ed.size() == globals::dms)
  483.         {
  484.             cw.write(static_cast<CodeType> (MetaCode::Reset));
  485.             ed.reset();
  486.             cw.reset_bits();
  487. std::clog << "DICTIONARY AND BITS RESET!\n";
  488.         }
  489.  
  490.         const CodeType temp {i};
  491.  
  492.         if ((i = ed.search_and_insert(temp, c)) == globals::dms)
  493.         {
  494.             if (required_bits(ed.size()) > cw.get_bits())
  495.                 cw.increase_bits();
  496.  
  497.             cw.write(temp);
  498.             i = ed.search_initials(c);
  499.         }
  500.     }
  501.  
  502.     if (i != globals::dms)
  503.         cw.write(i);
  504. }
  505.  
  506. ///
  507. /// @brief Decompresses the contents of `is` and writes the result to `os`.
  508. /// @param [in] is      input stream
  509. /// @param [out] os     output stream
  510. ///
  511. void decompress(std::istream &is, std::ostream &os)
  512. {
  513.     std::vector<std::pair<CodeType, char>> dictionary;
  514.  
  515.     // "named" lambda function, used to reset the dictionary to its initial contents
  516.     const auto reset_dictionary = [&dictionary] {
  517.         dictionary.clear();
  518.         dictionary.reserve(globals::dms);
  519.  
  520.         const long int minc = std::numeric_limits<char>::min();
  521.         const long int maxc = std::numeric_limits<char>::max();
  522.  
  523.         for (long int c = minc; c <= maxc; ++c)
  524.             dictionary.push_back({globals::dms, static_cast<char> (c)});
  525.  
  526.         // add dummy elements for the metacodes
  527.         dictionary.push_back({0, '\x00'}); // MetaCode::Eof
  528.         dictionary.push_back({0, '\x00'}); // MetaCode::Reset
  529.     };
  530.  
  531.     const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
  532.         static std::vector<char> s; // String
  533.  
  534.         s.clear();
  535.  
  536.         // the length of a string cannot exceed the dictionary's number of entries
  537.         s.reserve(globals::dms);
  538.  
  539.         while (k != globals::dms)
  540.         {
  541.             s.push_back(dictionary[k].second);
  542.             k = dictionary[k].first;
  543.         }
  544.  
  545.         std::reverse(s.begin(), s.end());
  546.         return &s;
  547.     };
  548.  
  549.     reset_dictionary();
  550.  
  551.     CodeReader cr(is);
  552.     CodeType i {globals::dms}; // Index
  553.     CodeType k; // Key
  554.  
  555.     while (true)
  556.     {
  557.         if (required_bits(dictionary.size() + 1) > cr.get_bits())
  558.             cr.increase_bits();
  559.  
  560.         if (!cr.read(k))
  561.             break;
  562.  
  563.         // the reset metacode was read
  564.         if (k == static_cast<CodeType> (MetaCode::Reset))
  565.         {
  566.             reset_dictionary();
  567.             cr.reset_bits();
  568. std::clog << "DICTIONARY AND BITS RESET!\n";
  569.             continue;
  570.         }
  571.  
  572.         if (k > dictionary.size())
  573. {
  574. std::clog << "d size: " << dictionary.size() << "\ticc: " << k << '\n';
  575.             throw std::runtime_error("invalid compressed code");
  576. }
  577.  
  578.         const std::vector<char> *s; // String
  579.  
  580.         if (k == dictionary.size())
  581.         {
  582.             dictionary.push_back({i, rebuild_string(i)->front()});
  583.             s = rebuild_string(k);
  584.         }
  585.         else
  586.         {
  587.             s = rebuild_string(k);
  588.  
  589.             if (i != globals::dms)
  590.                 dictionary.push_back({i, s->front()});
  591.         }
  592.  
  593.         os.write(&s->front(), s->size());
  594.         i = k;
  595.     }
  596.  
  597.     if (cr.corrupted())
  598.         throw std::runtime_error("corrupted compressed file");
  599. }
  600.  
  601. ///
  602. /// @brief Prints usage information and a custom error message.
  603. /// @param s    custom error message to be printed
  604. /// @param su   Show Usage information
  605. ///
  606. void print_usage(const std::string &s = "", bool su = true)
  607. {
  608.     if (!s.empty())
  609.         std::cerr << "\nERROR: " << s << '\n';
  610.  
  611.     if (su)
  612.     {
  613.         std::cerr << "\nUsage:\n";
  614.         std::cerr << "\tprogram -flag input_file output_file\n\n";
  615.         std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
  616.         std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
  617.         std::cerr << "Examples:\n";
  618.         std::cerr << "\tlzw_v6.exe -c license.txt license.lzw\n";
  619.         std::cerr << "\tlzw_v6.exe -d license.lzw new_license.txt\n";
  620.     }
  621.  
  622.     std::cerr << std::endl;
  623. }
  624.  
  625. ///
  626. /// @brief Actual program entry point.
  627. /// @param argc             number of command line arguments
  628. /// @param [in] argv        array of command line arguments
  629. /// @retval EXIT_FAILURE    for failed operation
  630. /// @retval EXIT_SUCCESS    for successful operation
  631. ///
  632. int main(int argc, char *argv[])
  633. {
  634.     if (argc != 4)
  635.     {
  636.         print_usage("Wrong number of arguments.");
  637.         return EXIT_FAILURE;
  638.     }
  639.  
  640.     enum class Mode {
  641.         Compress,
  642.         Decompress
  643.     };
  644.  
  645.     Mode m;
  646.  
  647.     if (std::string(argv[1]) == "-c")
  648.         m = Mode::Compress;
  649.     else
  650.     if (std::string(argv[1]) == "-d")
  651.         m = Mode::Decompress;
  652.     else
  653.     {
  654.         print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
  655.         return EXIT_FAILURE;
  656.     }
  657.  
  658.     const std::size_t buffer_size {1024 * 1024};
  659.  
  660.     // these custom buffers should be larger than the default ones
  661.     const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
  662.     const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
  663.  
  664.     std::ifstream input_file;
  665.     std::ofstream output_file;
  666.  
  667.     input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
  668.     input_file.open(argv[2], std::ios_base::binary);
  669.  
  670.     if (!input_file.is_open())
  671.     {
  672.         print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
  673.         return EXIT_FAILURE;
  674.     }
  675.  
  676.     output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
  677.     output_file.open(argv[3], std::ios_base::binary);
  678.  
  679.     if (!output_file.is_open())
  680.     {
  681.         print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
  682.         return EXIT_FAILURE;
  683.     }
  684.  
  685.     try
  686.     {
  687.         input_file.exceptions(std::ios_base::badbit);
  688.         output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
  689.  
  690.         if (m == Mode::Compress)
  691.             compress(input_file, output_file);
  692.         else
  693.         if (m == Mode::Decompress)
  694.             decompress(input_file, output_file);
  695.     }
  696.     catch (const std::ios_base::failure &f)
  697.     {
  698.         print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
  699.         return EXIT_FAILURE;
  700.     }
  701.     catch (const std::exception &e)
  702.     {
  703.         print_usage(std::string("Caught exception: ") + e.what() + '.', false);
  704.         return EXIT_FAILURE;
  705.     }
  706.  
  707.     return EXIT_SUCCESS;
  708. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement