Advertisement
Guest User

LZW v.6?

a guest
Mar 13th, 2013
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 18.96 KB | None | 0 0
  1. ///
  2. /// @file
  3. /// @author Julius Pettersson
  4. /// @copyright MIT/Expat License.
  5. /// @brief LZW file compressor
  6. /// @version 6
  7. /// @remarks This version borrows heavily from Juha Nieminen's work.
  8. ///
  9. /// This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line compressor.
  10. /// It was written with Doxygen comments.
  11. ///
  12. /// @see http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  13. /// @see http://marknelson.us/2011/11/08/lzw-revisited/
  14. /// @see http://www.cs.duke.edu/csed/curious/compression/lzw.html
  15. /// @see http://warp.povusers.org/EfficientLZW/index.html
  16. /// @see http://en.cppreference.com/
  17. /// @see http://www.doxygen.org/
  18. ///
  19. /// @remarks DF: the data file
  20. /// @remarks EF: the encoded file
  21. ///
  22.  
  23. #include <algorithm>
  24. #include <array>
  25. #include <climits>
  26. #include <cstddef>
  27. #include <cstdint>
  28. #include <cstdlib>
  29. #include <exception>
  30. #include <fstream>
  31. #include <ios>
  32. #include <iostream>
  33. #include <istream>
  34. #include <limits>
  35. #include <memory>
  36. #include <ostream>
  37. #include <stdexcept>
  38. #include <string>
  39. #include <utility>
  40. #include <vector>
  41.  
  42. /// Safety macro; if not defined, some overkillish safety checks are avoided.
  43. //#define TAKE_NO_RISKS
  44.  
  45. /// Type used to store and retrieve codes.
  46. using CodeType = std::uint32_t;
  47.  
  48. namespace globals {
  49.  
  50. /// Dictionary Maximum Size (when reached, the dictionary will be reset)
  51. const CodeType dms {16 * 1024 * 1024};
  52.  
  53. } // namespace globals
  54.  
  55. ///
  56. /// @brief Special codes used by the encoder to control the decoder.
  57. /// @todo Metacodes should not be hardcoded to match their index.
  58. ///
  59. enum class MetaCode: CodeType {
  60.     Eof = 1u << CHAR_BIT    ///< End-of-file metacode.
  61. };
  62.  
  63. ///
  64. /// @brief Encoder's custom dictionary type.
  65. ///
  66. class EncoderDictionary {
  67.  
  68.     ///
  69.     /// @brief Binary search tree node.
  70.     ///
  71.     struct Node {
  72.  
  73.         ///
  74.         /// @brief Default constructor.
  75.         /// @param c    byte that the Node will contain
  76.         ///
  77.         explicit Node(char c): first(globals::dms), c(c), left(globals::dms), right(globals::dms)
  78.         {
  79.         }
  80.  
  81.         CodeType    first;  ///< Code of first child string.
  82.         char        c;      ///< Byte.
  83.         CodeType    left;   ///< Code of child node with byte < `c`.
  84.         CodeType    right;  ///< Code of child node with byte > `c`.
  85.     };
  86.  
  87. public:
  88.  
  89.     ///
  90.     /// @brief Default constructor.
  91.     /// @details It builds the `initials` cheat sheet.
  92.     ///
  93.     EncoderDictionary()
  94.     {
  95.         const long int minc = std::numeric_limits<char>::min();
  96.         const long int maxc = std::numeric_limits<char>::max();
  97.         CodeType k {0};
  98.  
  99.         for (long int c = minc; c <= maxc; ++c)
  100.             initials[static_cast<unsigned char> (c)] = k++;
  101.  
  102.         vn.reserve(globals::dms);
  103.         reset();
  104.     }
  105.  
  106.     ///
  107.     /// @brief Resets dictionary to its initial contents.
  108.     /// @note Adds dummy nodes to account for the metacodes.
  109.     ///
  110.     void reset()
  111.     {
  112.         vn.clear();
  113.  
  114.         const long int minc = std::numeric_limits<char>::min();
  115.         const long int maxc = std::numeric_limits<char>::max();
  116.  
  117.         for (long int c = minc; c <= maxc; ++c)
  118.             vn.push_back(Node(c));
  119.  
  120.         // add dummy nodes for the metacodes
  121.         vn.push_back(Node('\x00')); // MetaCode::Eof
  122.     }
  123.  
  124.     ///
  125.     /// @brief Searches for a pair (`i`, `c`) and inserts the pair if it wasn't found.
  126.     /// @param i                code to search for
  127.     /// @param c                attached byte to search for
  128.     /// @return The index of the pair, if it was found.
  129.     /// @retval globals::dms    if the pair wasn't found
  130.     ///
  131.     CodeType search_and_insert(CodeType i, char c)
  132.     {
  133.         if (i == globals::dms)
  134.             return search_initials(c);
  135.  
  136.         const CodeType vn_size = vn.size();
  137.         CodeType ci {vn[i].first}; // Current Index
  138.  
  139.         if (ci != globals::dms)
  140.         {
  141.             while (true)
  142.                 if (c < vn[ci].c)
  143.                 {
  144.                     if (vn[ci].left == globals::dms)
  145.                     {
  146.                         vn[ci].left = vn_size;
  147.                         break;
  148.                     }
  149.                     else
  150.                         ci = vn[ci].left;
  151.                 }
  152.                 else
  153.                 if (c > vn[ci].c)
  154.                 {
  155.                     if (vn[ci].right == globals::dms)
  156.                     {
  157.                         vn[ci].right = vn_size;
  158.                         break;
  159.                     }
  160.                     else
  161.                         ci = vn[ci].right;
  162.                 }
  163.                 else // c == vn[ci].c
  164.                     return ci;
  165.         }
  166.         else
  167.             vn[i].first = vn_size;
  168.  
  169.         vn.push_back(Node(c));
  170.         return globals::dms;
  171.     }
  172.  
  173.     ///
  174.     /// @brief Fakes a search for byte `c` in the one-byte area of the dictionary.
  175.     /// @param c    byte to search for
  176.     /// @return The code associated to the searched byte.
  177.     ///
  178.     CodeType search_initials(char c) const
  179.     {
  180.         return initials[static_cast<unsigned char> (c)];
  181.     }
  182.  
  183.     ///
  184.     /// @brief Returns the number of dictionary entries.
  185.     ///
  186.     std::vector<Node>::size_type size() const
  187.     {
  188.         return vn.size();
  189.     }
  190.  
  191. private:
  192.  
  193.     /// Vector of nodes on top of which the binary search tree is implemented.
  194.     std::vector<Node> vn;
  195.  
  196.     /// Cheat sheet for mapping one-byte strings to their codes.
  197.     std::array<CodeType, 1u << CHAR_BIT> initials;
  198. };
  199.  
  200. ///
  201. /// @brief Helper structure for use in `CodeWriter` and `CodeReader`.
  202. ///
  203. struct ByteCache {
  204.  
  205.     ///
  206.     /// @brief Default constructor.
  207.     ///
  208.     ByteCache(): data(0x00), used(0)
  209.     {
  210.     }
  211.  
  212.     unsigned char   data;   ///< The bits of the cached byte.
  213.     std::size_t     used;   ///< Bits currently in use.
  214. };
  215.  
  216. ///
  217. /// @brief Variable binary width code writer.
  218. ///
  219. class CodeWriter {
  220. public:
  221.  
  222.     ///
  223.     /// @brief Default constructor.
  224.     /// @param [out] os     Output Stream to write codes to
  225.     ///
  226.     explicit CodeWriter(std::ostream &os): os(os), bits(CHAR_BIT + 1)
  227.     {
  228.     }
  229.  
  230.     ///
  231.     /// @brief Destructor.
  232.     /// @note Writes `MetaCode::Eof` and flushes the final byte to the stream by padding it with zeros.
  233.     ///
  234.     ~CodeWriter()
  235.     {
  236.         write(static_cast<CodeType> (MetaCode::Eof));
  237.  
  238.         if (lo.used != 0)
  239.         {
  240.             bits = CHAR_BIT;
  241.             write(0);
  242.         }
  243.     }
  244.  
  245.     ///
  246.     /// @brief Getter for `CodeWriter::bits`.
  247.     ///
  248.     std::size_t get_bits() const
  249.     {
  250.         return bits;
  251.     }
  252.  
  253.     ///
  254.     /// @brief Resets internal binary width.
  255.     /// @note Default value is `CHAR_BIT + 1`.
  256.     ///
  257.     void reset_bits()
  258.     {
  259.         bits = CHAR_BIT + 1;
  260.     }
  261.  
  262.     ///
  263.     /// @brief Increases internal binary width by one.
  264.     /// @throws std::overflow_error     internal binary width cannot be increased
  265.     /// @remarks The exception should never be thrown, under normal circumstances.
  266.     ///
  267.     void increase_bits()
  268.     {
  269. #ifdef TAKE_NO_RISKS
  270.         if (bits == SIZE_MAX)
  271.             throw std::overflow_error("CodeWriter::increase_bits()");
  272. #endif
  273. std::clog << "bits: " << bits << " -> " << bits + 1 << '\n';
  274.         ++bits;
  275.     }
  276.  
  277.     ///
  278.     /// @brief Writes the code `k` with a binary width of `CodeWriter::bits`.
  279.     /// @param k            code to be written
  280.     /// @return Whether or not the stream can be used for output.
  281.     /// @retval true        the output stream can still be used
  282.     /// @retval false       the output stream can no longer be used
  283.     ///
  284.     bool write(CodeType k)
  285.     {
  286.         static const std::array<unsigned char, 9> masks {{0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF}};
  287.         std::size_t remaining_bits {bits};
  288.  
  289.         // to hold the incomplete last byte of k
  290.         ByteCache k_last;
  291.  
  292.         // save the tail of k, then remove it from k
  293.         k_last.used = (remaining_bits - (CHAR_BIT - lo.used)) % CHAR_BIT;
  294.         k_last.data = (k & masks[k_last.used]) << (CHAR_BIT - k_last.used);
  295.         k >>= k_last.used;
  296.         remaining_bits -= k_last.used;
  297.  
  298.         // to hold k's full bytes
  299.         static std::vector<char> k_bytes;
  300.  
  301.         k_bytes.clear();
  302.         k_bytes.reserve(sizeof (CodeType));
  303.  
  304.         // to hold k's first byte
  305.         unsigned char k_first {0x00};
  306.  
  307.         while (remaining_bits != 0)
  308.             if (remaining_bits >= CHAR_BIT)
  309.             {
  310.                 k_bytes.push_back(k);
  311.                 k >>= CHAR_BIT;
  312.                 remaining_bits -= CHAR_BIT;
  313.             }
  314.             else
  315.             {
  316.                 k_first = k;
  317.                 remaining_bits = 0;
  318.             }
  319.  
  320.         // k_first only has content if lo.data has too
  321.         if (lo.used != 0)
  322.             os.put(static_cast<char> (lo.data | k_first));
  323.  
  324.         while (!k_bytes.empty())
  325.         {
  326.             os.put(k_bytes.back());
  327.             k_bytes.pop_back();
  328.         }
  329.  
  330.         lo = k_last;
  331.         return os;
  332.     }
  333.  
  334. private:
  335.  
  336.     std::ostream    &os;    ///< Output Stream.
  337.     std::size_t     bits;   ///< Binary width of codes.
  338.     ByteCache       lo;     ///< LeftOvers.
  339. };
  340.  
  341. ///
  342. /// @brief Variable binary width code reader.
  343. ///
  344. class CodeReader {
  345. public:
  346.  
  347.     ///
  348.     /// @brief Default constructor.
  349.     /// @param [in] is      Input Stream to read codes from
  350.     ///
  351.     explicit CodeReader(std::istream &is): is(is), bits(CHAR_BIT + 1), feofmc(false)
  352.     {
  353.     }
  354.  
  355.     ///
  356.     /// @brief Getter for `CodeReader::bits`.
  357.     ///
  358.     std::size_t get_bits() const
  359.     {
  360.         return bits;
  361.     }
  362.  
  363.     ///
  364.     /// @brief Resets internal binary width.
  365.     /// @note Default value is `CHAR_BIT + 1`.
  366.     ///
  367.     void reset_bits()
  368.     {
  369.         bits = CHAR_BIT + 1;
  370.     }
  371.  
  372.     ///
  373.     /// @brief Increases internal binary width by one.
  374.     /// @throws std::overflow_error     if internal binary width cannot be increased
  375.     /// @remarks The exception should never be thrown, under normal circumstances.
  376.     ///
  377.     void increase_bits()
  378.     {
  379. #ifdef TAKE_NO_RISKS
  380.         if (bits == SIZE_MAX)
  381.             throw std::overflow_error("CodeReader::increase_bits()");
  382. #endif
  383. std::clog << "bits: " << bits << " -> " << bits + 1 << '\n';
  384.         ++bits;
  385.     }
  386.  
  387.     ///
  388.     /// @brief Reads the code `k` with a binary width of `CodeReader::bits`.
  389.     /// @param [out] k      code to be read
  390.     /// @return Whether or not the stream can be used for input.
  391.     /// @retval true        the input stream can still be used
  392.     /// @retval false       the input stream can no longer be used
  393.     ///
  394.     bool read(CodeType &k)
  395.     {
  396.         static const std::array<unsigned char, 9> masks {{0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF}};
  397.         std::size_t remaining_bits {bits};
  398.         unsigned char temp;
  399.  
  400.         k = lo.data;
  401.         remaining_bits -= lo.used;
  402.         lo.used = 0;
  403.         lo.data = 0x00;
  404.  
  405.         while (remaining_bits != 0 && is.get(reinterpret_cast<char &> (temp)))
  406.             if (remaining_bits >= CHAR_BIT)
  407.             {
  408.                 k <<= CHAR_BIT;
  409.                 k |= temp;
  410.                 remaining_bits -= CHAR_BIT;
  411.             }
  412.             else
  413.             {
  414.                 k <<= remaining_bits;
  415.                 k |= temp >> (CHAR_BIT - remaining_bits);
  416.                 lo.used = CHAR_BIT - remaining_bits;
  417.                 lo.data = temp & masks[lo.used];
  418.                 remaining_bits = 0;
  419.             }
  420.  
  421.         if (k == static_cast<CodeType> (MetaCode::Eof))
  422.         {
  423.             feofmc = true;
  424.             return false;
  425.         }
  426.  
  427.         return is;
  428.     }
  429.  
  430.     ///
  431.     /// @brief Returns if EF is considered corrupted.
  432.     /// @retval true    didn't find end-of-file metacode
  433.     /// @retval false   found end-of-file metacode
  434.     ///
  435.     bool corrupted() const
  436.     {
  437.         return !feofmc;
  438.     }
  439.  
  440. private:
  441.  
  442.     std::istream    &is;    ///< Input Stream.
  443.     std::size_t     bits;   ///< Binary width of codes.
  444.     bool            feofmc; ///< Found End-Of-File MetaCode.
  445.     ByteCache       lo;     ///< LeftOvers.
  446. };
  447.  
  448. ///
  449. /// @brief Computes the minimum number of bits required to store the value of `n`.
  450. /// @param n    number to be evaluated
  451. /// @return Number of required bits.
  452. ///
  453. std::size_t required_bits(unsigned long int n)
  454. {
  455.     std::size_t r {1};
  456.  
  457.     while ((n >>= 1) != 0)
  458.         ++r;
  459.  
  460.     return r;
  461. }
  462.  
  463. ///
  464. /// @brief Compresses the contents of `is` and writes the result to `os`.
  465. /// @param [in] is      input stream
  466. /// @param [out] os     output stream
  467. ///
  468. void compress(std::istream &is, std::ostream &os)
  469. {
  470.     EncoderDictionary ed;
  471.     CodeWriter cw(os);
  472.     CodeType i {globals::dms}; // Index
  473.     char c;
  474.  
  475.     while (is.get(c))
  476.     {
  477.         // dictionary's maximum size was reached
  478.         if (ed.size() == globals::dms)
  479.         {
  480.             ed.reset();
  481.             cw.reset_bits();
  482.         }
  483.  
  484.         const CodeType temp {i};
  485.  
  486.         if ((i = ed.search_and_insert(temp, c)) == globals::dms)
  487.         {
  488.             cw.write(temp);
  489.             i = ed.search_initials(c);
  490.  
  491.             // theoretically, this should be a while() loop
  492.             if (required_bits(ed.size()) > cw.get_bits())
  493.                 cw.increase_bits();
  494.         }
  495.     }
  496.  
  497.     if (i != globals::dms)
  498.         cw.write(i);
  499. }
  500.  
  501. ///
  502. /// @brief Decompresses the contents of `is` and writes the result to `os`.
  503. /// @param [in] is      input stream
  504. /// @param [out] os     output stream
  505. ///
  506. void decompress(std::istream &is, std::ostream &os)
  507. {
  508.     std::vector<std::pair<CodeType, char>> dictionary;
  509.  
  510.     // "named" lambda function, used to reset the dictionary to its initial contents
  511.     const auto reset_dictionary = [&dictionary] {
  512.         dictionary.clear();
  513.         dictionary.reserve(globals::dms);
  514.  
  515.         const long int minc = std::numeric_limits<char>::min();
  516.         const long int maxc = std::numeric_limits<char>::max();
  517.  
  518.         for (long int c = minc; c <= maxc; ++c)
  519.             dictionary.push_back({globals::dms, static_cast<char> (c)});
  520.  
  521.         // add dummy elements for the metacodes
  522.         dictionary.push_back({0, '\x00'}); // MetaCode::Eof
  523.     };
  524.  
  525.     const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
  526.         static std::vector<char> s; // String
  527.  
  528.         s.clear();
  529.  
  530.         // the length of a string cannot exceed the dictionary's number of entries
  531.         s.reserve(globals::dms);
  532.  
  533.         while (k != globals::dms)
  534.         {
  535.             s.push_back(dictionary[k].second);
  536.             k = dictionary[k].first;
  537.         }
  538.  
  539.         std::reverse(s.begin(), s.end());
  540.         return &s;
  541.     };
  542.  
  543.     reset_dictionary();
  544.  
  545.     CodeReader cr(is);
  546.     CodeType i {globals::dms}; // Index
  547.     CodeType k; // Key
  548.  
  549.     while (true)
  550.     {
  551.         // dictionary's maximum size was reached
  552.         if (dictionary.size() == globals::dms)
  553.         {
  554.             reset_dictionary();
  555.             cr.reset_bits();
  556.         }
  557.  
  558.         // theoretically, this should be a while() loop
  559.         if (required_bits(dictionary.size() + 1) > cr.get_bits())
  560.             cr.increase_bits();
  561.  
  562.         if (!cr.read(k))
  563.             break;
  564.  
  565.         if (k > dictionary.size())
  566.             throw std::runtime_error("invalid compressed code");
  567.  
  568.         const std::vector<char> *s; // String
  569.  
  570.         if (k == dictionary.size())
  571.         {
  572.             dictionary.push_back({i, rebuild_string(i)->front()});
  573.             s = rebuild_string(k);
  574.         }
  575.         else
  576.         {
  577.             s = rebuild_string(k);
  578.  
  579.             if (i != globals::dms)
  580.                 dictionary.push_back({i, s->front()});
  581.         }
  582.  
  583.         os.write(&s->front(), s->size());
  584.         i = k;
  585.     }
  586.  
  587.     if (cr.corrupted())
  588.         throw std::runtime_error("corrupted compressed file");
  589. }
  590.  
  591. ///
  592. /// @brief Prints usage information and a custom error message.
  593. /// @param s    custom error message to be printed
  594. /// @param su   Show Usage information
  595. ///
  596. void print_usage(const std::string &s = "", bool su = true)
  597. {
  598.     if (!s.empty())
  599.         std::cerr << "\nERROR: " << s << '\n';
  600.  
  601.     if (su)
  602.     {
  603.         std::cerr << "\nUsage:\n";
  604.         std::cerr << "\tprogram -flag input_file output_file\n\n";
  605.         std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
  606.         std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
  607.         std::cerr << "Examples:\n";
  608.         std::cerr << "\tlzw_v6.exe -c license.txt license.lzw\n";
  609.         std::cerr << "\tlzw_v6.exe -d license.lzw new_license.txt\n";
  610.     }
  611.  
  612.     std::cerr << std::endl;
  613. }
  614.  
  615. ///
  616. /// @brief Actual program entry point.
  617. /// @param argc             number of command line arguments
  618. /// @param [in] argv        array of command line arguments
  619. /// @retval EXIT_FAILURE    for failed operation
  620. /// @retval EXIT_SUCCESS    for successful operation
  621. ///
  622. int main(int argc, char *argv[])
  623. {
  624.     if (argc != 4)
  625.     {
  626.         print_usage("Wrong number of arguments.");
  627.         return EXIT_FAILURE;
  628.     }
  629.  
  630.     enum class Mode {
  631.         Compress,
  632.         Decompress
  633.     };
  634.  
  635.     Mode m;
  636.  
  637.     if (std::string(argv[1]) == "-c")
  638.         m = Mode::Compress;
  639.     else
  640.     if (std::string(argv[1]) == "-d")
  641.         m = Mode::Decompress;
  642.     else
  643.     {
  644.         print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
  645.         return EXIT_FAILURE;
  646.     }
  647.  
  648.     const std::size_t buffer_size {1024 * 1024};
  649.  
  650.     // these custom buffers should be larger than the default ones
  651.     const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
  652.     const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
  653.  
  654.     std::ifstream input_file;
  655.     std::ofstream output_file;
  656.  
  657.     input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
  658.     input_file.open(argv[2], std::ios_base::binary);
  659.  
  660.     if (!input_file.is_open())
  661.     {
  662.         print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
  663.         return EXIT_FAILURE;
  664.     }
  665.  
  666.     output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
  667.     output_file.open(argv[3], std::ios_base::binary);
  668.  
  669.     if (!output_file.is_open())
  670.     {
  671.         print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
  672.         return EXIT_FAILURE;
  673.     }
  674.  
  675.     try
  676.     {
  677.         input_file.exceptions(std::ios_base::badbit);
  678.         output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
  679.  
  680.         if (m == Mode::Compress)
  681.             compress(input_file, output_file);
  682.         else
  683.         if (m == Mode::Decompress)
  684.             decompress(input_file, output_file);
  685.     }
  686.     catch (const std::ios_base::failure &f)
  687.     {
  688.         print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
  689.         return EXIT_FAILURE;
  690.     }
  691.     catch (const std::exception &e)
  692.     {
  693.         print_usage(std::string("Caught exception: ") + e.what() + '.', false);
  694.         return EXIT_FAILURE;
  695.     }
  696.  
  697.     return EXIT_SUCCESS;
  698. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement