Advertisement
Guest User

LZW v.6-

a guest
Mar 12th, 2013
44
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 17.38 KB | None | 0 0
  1. ///
  2. /// @file
  3. /// @author Julius Pettersson
  4. /// @copyright MIT/Expat License.
  5. /// @brief LZW file compressor
  6. /// @version 6
  7. /// @remarks This version borrows heavily from Juha Nieminen's work.
  8. ///
  9. /// This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line compressor.
  10. /// It was written with Doxygen comments.
  11. ///
  12. /// @see http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  13. /// @see http://marknelson.us/2011/11/08/lzw-revisited/
  14. /// @see http://www.cs.duke.edu/csed/curious/compression/lzw.html
  15. /// @see http://warp.povusers.org/EfficientLZW/index.html
  16. /// @see http://en.cppreference.com/
  17. /// @see http://www.doxygen.org/
  18. ///
  19. /// @remarks DF: the data file
  20. /// @remarks EF: the encoded file
  21. ///
  22.  
  23. #include <algorithm>
  24. #include <array>
  25. #include <climits>
  26. #include <cstddef>
  27. #include <cstdint>
  28. #include <cstdlib>
  29. #include <exception>
  30. #include <fstream>
  31. #include <ios>
  32. #include <iostream>
  33. #include <istream>
  34. #include <limits>
  35. #include <memory>
  36. #include <ostream>
  37. #include <stdexcept>
  38. #include <string>
  39. #include <utility>
  40. #include <vector>
  41.  
  42. /// Type used to store and retrieve codes.
  43. using CodeType = std::uint32_t;
  44.  
  45. namespace globals {
  46.  
  47. /// Dictionary Maximum Size (when reached, the dictionary will be reset)
  48. const CodeType dms {16 * 1024 * 1024};
  49.  
  50. } // namespace globals
  51.  
  52. ///
  53. /// @brief Special codes used by the encoder to control the decoder.
  54. /// @todo Metacodes should not be hardcoded to match their index.
  55. ///
  56. enum class MetaCode: CodeType {
  57.     Eof = 1u << CHAR_BIT    ///< End-of-file metacode.
  58. };
  59.  
  60. ///
  61. /// @brief Encoder's custom dictionary type.
  62. ///
  63. class EncoderDictionary {
  64.  
  65.     ///
  66.     /// @brief Binary search tree node.
  67.     ///
  68.     struct Node {
  69.  
  70.         ///
  71.         /// @brief Default constructor.
  72.         /// @param c    byte that the Node will contain
  73.         ///
  74.         explicit Node(char c): first(globals::dms), c(c), left(globals::dms), right(globals::dms)
  75.         {
  76.         }
  77.  
  78.         CodeType    first;  ///< Code of first child string.
  79.         char        c;      ///< Byte.
  80.         CodeType    left;   ///< Code of child node with byte < `c`.
  81.         CodeType    right;  ///< Code of child node with byte > `c`.
  82.     };
  83.  
  84. public:
  85.  
  86.     ///
  87.     /// @brief Default constructor.
  88.     /// @details It builds the `initials` cheat sheet.
  89.     ///
  90.     EncoderDictionary()
  91.     {
  92.         const long int minc = std::numeric_limits<char>::min();
  93.         const long int maxc = std::numeric_limits<char>::max();
  94.         CodeType k {0};
  95.  
  96.         for (long int c = minc; c <= maxc; ++c)
  97.             initials[static_cast<unsigned char> (c)] = k++;
  98.  
  99.         vn.reserve(globals::dms);
  100.         reset();
  101.     }
  102.  
  103.     ///
  104.     /// @brief Resets dictionary to its initial contents.
  105.     /// @note Adds dummy nodes to account for the metacodes.
  106.     ///
  107.     void reset()
  108.     {
  109.         vn.clear();
  110.  
  111.         const long int minc = std::numeric_limits<char>::min();
  112.         const long int maxc = std::numeric_limits<char>::max();
  113.  
  114.         for (long int c = minc; c <= maxc; ++c)
  115.             vn.push_back(Node(c));
  116.  
  117.         // add dummy nodes for the metacodes
  118.         vn.push_back(Node('\x00')); // MetaCode::Eof
  119.     }
  120.  
  121.     ///
  122.     /// @brief Searches for a pair (`i`, `c`) and inserts the pair if it wasn't found.
  123.     /// @param i                code to search for
  124.     /// @param c                attached byte to search for
  125.     /// @return The index of the pair, if it was found.
  126.     /// @retval globals::dms    if the pair wasn't found
  127.     ///
  128.     CodeType search_and_insert(CodeType i, char c)
  129.     {
  130.         if (i == globals::dms)
  131.             return search_initials(c);
  132.  
  133.         const CodeType vn_size = vn.size();
  134.         CodeType ci {vn[i].first}; // Current Index
  135.  
  136.         if (ci != globals::dms)
  137.         {
  138.             while (true)
  139.                 if (c < vn[ci].c)
  140.                 {
  141.                     if (vn[ci].left == globals::dms)
  142.                     {
  143.                         vn[ci].left = vn_size;
  144.                         break;
  145.                     }
  146.                     else
  147.                         ci = vn[ci].left;
  148.                 }
  149.                 else
  150.                 if (c > vn[ci].c)
  151.                 {
  152.                     if (vn[ci].right == globals::dms)
  153.                     {
  154.                         vn[ci].right = vn_size;
  155.                         break;
  156.                     }
  157.                     else
  158.                         ci = vn[ci].right;
  159.                 }
  160.                 else // c == vn[ci].c
  161.                     return ci;
  162.         }
  163.         else
  164.             vn[i].first = vn_size;
  165.  
  166.         vn.push_back(Node(c));
  167.         return globals::dms;
  168.     }
  169.  
  170.     ///
  171.     /// @brief Fakes a search for byte `c` in the one-byte area of the dictionary.
  172.     /// @param c    byte to search for
  173.     /// @return The code associated to the searched byte.
  174.     ///
  175.     CodeType search_initials(char c) const
  176.     {
  177.         return initials[static_cast<unsigned char> (c)];
  178.     }
  179.  
  180.     ///
  181.     /// @brief Returns the number of dictionary entries.
  182.     ///
  183.     std::vector<Node>::size_type size() const
  184.     {
  185.         return vn.size();
  186.     }
  187.  
  188. private:
  189.  
  190.     /// Vector of nodes on top of which the binary search tree is implemented.
  191.     std::vector<Node> vn;
  192.  
  193.     /// Cheat sheet for mapping one-byte strings to their codes.
  194.     std::array<CodeType, 1u << CHAR_BIT> initials;
  195. };
  196.  
  197. ///
  198. /// @brief Variable binary width code writer.
  199. ///
  200. class CodeWriter {
  201. public:
  202.  
  203.     ///
  204.     /// @brief Default constructor.
  205.     /// @param [out] os     Output Stream to write codes to
  206.     ///
  207.     explicit CodeWriter(std::ostream &os): os(os), bits(CHAR_BIT + 1)
  208.     {
  209.     }
  210.  
  211.     ///
  212.     /// @brief Destructor.
  213.     /// @note Writes `MetaCode::Eof` and flushes the final byte to the stream by padding it with zeros.
  214.     ///
  215.     ~CodeWriter()
  216.     {
  217.         write(static_cast<CodeType> (MetaCode::Eof));
  218.  
  219.         if (lo.second != 0)
  220.         {
  221.             bits = CHAR_BIT;
  222.             write(0);
  223.         }
  224.     }
  225.  
  226.     ///
  227.     /// @brief Getter for `CodeWriter::bits`.
  228.     ///
  229.     std::size_t get_bits() const
  230.     {
  231.         return bits;
  232.     }
  233.  
  234.     ///
  235.     /// @brief Resets internal binary width.
  236.     /// @note Default value is `CHAR_BIT + 1`.
  237.     ///
  238.     void reset_bits()
  239.     {
  240.         bits = CHAR_BIT + 1;
  241.     }
  242.  
  243.     ///
  244.     /// @brief Increases internal binary width by one.
  245.     /// @throws std::overflow_error     internal binary width cannot be increased
  246.     /// @remarks The exception should never be thrown, under normal circumstances.
  247.     ///
  248.     void increase_bits()
  249.     {
  250.         if (bits == SIZE_MAX)
  251.             throw std::overflow_error("CodeWriter::increase_bits()");
  252.  
  253.         ++bits;
  254.     }
  255.  
  256.     ///
  257.     /// @brief Writes the code `k` with a binary width of `CodeWriter::bits`.
  258.     /// @param k            code to be written
  259.     /// @return Whether or not the stream can be used for output.
  260.     /// @retval true        the output stream can still be used
  261.     /// @retval false       the output stream can no longer be used
  262.     ///
  263.     bool write(CodeType k)
  264.     {
  265.         std::size_t local_bits {bits};
  266.  
  267.         while (local_bits != 0)
  268.         {
  269.             lo.first |= k << lo.second;
  270.             k >>= CHAR_BIT - lo.second;
  271.  
  272.             if (lo.second + local_bits >= CHAR_BIT)
  273.             {
  274.                 local_bits -= CHAR_BIT - lo.second;
  275.                 os.put(static_cast<char> (lo.first));
  276.                 lo = {'\x00', 0};
  277.             }
  278.             else
  279.             {
  280.                 lo.second += local_bits;
  281.                 local_bits = 0;
  282.             }
  283.         }
  284.  
  285.         return os;
  286.     }
  287.  
  288. private:
  289.  
  290.     std::ostream &os; ///< Output Stream.
  291.     std::size_t bits; ///< Binary width of codes.
  292.     std::pair<unsigned char, std::size_t> lo; ///< LeftOvers.
  293. };
  294.  
  295. ///
  296. /// @brief Variable binary width code reader.
  297. ///
  298. class CodeReader {
  299. public:
  300.  
  301.     ///
  302.     /// @brief Default constructor.
  303.     /// @param [in] is      Input Stream to read codes from
  304.     ///
  305.     explicit CodeReader(std::istream &is): is(is), bits(CHAR_BIT + 1), feofmc(false)
  306.     {
  307.     }
  308.  
  309.     ///
  310.     /// @brief Getter for `CodeReader::bits`.
  311.     ///
  312.     std::size_t get_bits() const
  313.     {
  314.         return bits;
  315.     }
  316.  
  317.     ///
  318.     /// @brief Resets internal binary width.
  319.     /// @note Default value is `CHAR_BIT + 1`.
  320.     ///
  321.     void reset_bits()
  322.     {
  323.         bits = CHAR_BIT + 1;
  324.     }
  325.  
  326.     ///
  327.     /// @brief Increases internal binary width by one.
  328.     /// @throws std::overflow_error     if internal binary width cannot be increased
  329.     /// @remarks The exception should never be thrown, under normal circumstances.
  330.     ///
  331.     void increase_bits()
  332.     {
  333.         if (bits == SIZE_MAX)
  334.             throw std::overflow_error("CodeReader::increase_bits()");
  335.  
  336.         ++bits;
  337.     }
  338.  
  339.     ///
  340.     /// @brief Reads the code `k` with a binary width of `CodeReader::bits`.
  341.     /// @param [out] k      code to be read
  342.     /// @return Whether or not the stream can be used for input.
  343.     /// @retval true        the input stream can still be used
  344.     /// @retval false       the input stream can no longer be used
  345.     ///
  346.     bool read(CodeType &k)
  347.     {
  348.         k=0;
  349.  
  350.         std::size_t local_bits {bits};
  351.         std::size_t k_count {0};
  352.  
  353.         // to prevent computing the required mask
  354.         const std::array<unsigned char, 9> masks {{0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF}};
  355.  
  356.         while (local_bits != 0)
  357.         {
  358.             const std::size_t n = std::min(lo.second, local_bits);
  359.  
  360.             k |= static_cast<CodeType> (lo.first & masks[n]) << k_count;
  361.             k_count += n;
  362.             local_bits -= n;
  363.             lo.second -= n;
  364.             lo.first >>= n;
  365.  
  366.             if (is.get(reinterpret_cast<char &> (lo.first)))
  367.                 break;
  368.  
  369.             lo.second = CHAR_BIT;
  370.         }
  371.  
  372.         if (k == static_cast<CodeType> (MetaCode::Eof))
  373.         {
  374.             feofmc = true;
  375.             return false;
  376.         }
  377.  
  378.         return is;
  379.     }
  380.  
  381.     ///
  382.     /// @brief Returns if EF is considered corrupted.
  383.     /// @retval true    didn't find end-of-file metacode
  384.     /// @retval false   found end-of-file metacode
  385.     ///
  386.     bool corrupted() const
  387.     {
  388.         return !feofmc;
  389.     }
  390.  
  391. private:
  392.  
  393.     std::istream &is; ///< Input Stream.
  394.     std::size_t bits; ///< Binary width of codes.
  395.     bool feofmc; ///< Found End-Of-File MetaCode.
  396.     std::pair<unsigned char, std::size_t> lo; ///< LeftOvers.
  397. };
  398.  
  399. ///
  400. /// @brief Computes the minimum number of bits required to store the value of `n`.
  401. /// @param n    number to be evaluated
  402. /// @return Number of required bits.
  403. ///
  404. std::size_t required_bits(unsigned long int n)
  405. {
  406.     std::size_t r {1};
  407.  
  408.     while ((n >>= 1) != 0)
  409.         ++r;
  410.  
  411.     return r;
  412. }
  413.  
  414. ///
  415. /// @brief Compresses the contents of `is` and writes the result to `os`.
  416. /// @param [in] is      input stream
  417. /// @param [out] os     output stream
  418. ///
  419. void compress(std::istream &is, std::ostream &os)
  420. {
  421.     EncoderDictionary ed;
  422.     CodeWriter cw(os);
  423.     CodeType i {globals::dms}; // Index
  424.     char c;
  425.  
  426.     while (is.get(c))
  427.     {
  428.         // dictionary's maximum size was reached
  429.         if (ed.size() == globals::dms)
  430.         {
  431.             ed.reset();
  432.             cw.reset_bits();
  433.         }
  434.  
  435.         const CodeType temp {i};
  436.  
  437.         if ((i = ed.search_and_insert(temp, c)) == globals::dms)
  438.         {
  439.             cw.write(temp);
  440.             i = ed.search_initials(c);
  441.  
  442.             // theoretically, this should be a while() loop
  443.             if (required_bits(ed.size()) > cw.get_bits())
  444.                 cw.increase_bits();
  445.         }
  446.     }
  447.  
  448.     if (i != globals::dms)
  449.         cw.write(i);
  450. }
  451.  
  452. ///
  453. /// @brief Decompresses the contents of `is` and writes the result to `os`.
  454. /// @param [in] is      input stream
  455. /// @param [out] os     output stream
  456. ///
  457. void decompress(std::istream &is, std::ostream &os)
  458. {
  459.     std::vector<std::pair<CodeType, char>> dictionary;
  460.  
  461.     // "named" lambda function, used to reset the dictionary to its initial contents
  462.     const auto reset_dictionary = [&dictionary] {
  463.         dictionary.clear();
  464.         dictionary.reserve(globals::dms);
  465.  
  466.         const long int minc = std::numeric_limits<char>::min();
  467.         const long int maxc = std::numeric_limits<char>::max();
  468.  
  469.         for (long int c = minc; c <= maxc; ++c)
  470.             dictionary.push_back({globals::dms, static_cast<char> (c)});
  471.  
  472.         // add dummy elements for the metacodes
  473.         dictionary.push_back({0, '\x00'}); // MetaCode::Eof
  474.     };
  475.  
  476.     const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
  477.         static std::vector<char> s; // String
  478.  
  479.         s.clear();
  480.  
  481.         // the length of a string cannot exceed the dictionary's number of entries
  482.         s.reserve(globals::dms);
  483.  
  484.         while (k != globals::dms)
  485.         {
  486.             s.push_back(dictionary[k].second);
  487.             k = dictionary[k].first;
  488.         }
  489.  
  490.         std::reverse(s.begin(), s.end());
  491.         return &s;
  492.     };
  493.  
  494.     reset_dictionary();
  495.  
  496.     CodeReader cr(is);
  497.     CodeType i {globals::dms}; // Index
  498.     CodeType k; // Key
  499.  
  500.     while (cr.read(k))
  501.     {
  502.         // dictionary's maximum size was reached
  503.         if (dictionary.size() == globals::dms)
  504.         {
  505.             reset_dictionary();
  506.             cr.reset_bits();
  507.         }
  508.  
  509.         // theoretically, this should be a while() loop
  510.         if (required_bits(dictionary.size()) > cr.get_bits())
  511.             cr.increase_bits();
  512.  
  513.         if (k > dictionary.size())
  514.         {
  515.             std::clog << k << '\n';
  516.             throw std::runtime_error("invalid compressed code");
  517.         }
  518.  
  519.         const std::vector<char> *s; // String
  520.  
  521.         if (k == dictionary.size())
  522.         {
  523.             dictionary.push_back({i, rebuild_string(i)->front()});
  524.             s = rebuild_string(k);
  525.         }
  526.         else
  527.         {
  528.             s = rebuild_string(k);
  529.  
  530.             if (i != globals::dms)
  531.                 dictionary.push_back({i, s->front()});
  532.         }
  533.  
  534.         os.write(&s->front(), s->size());
  535.         i = k;
  536.     }
  537.  
  538.     if (cr.corrupted())
  539.         throw std::runtime_error("corrupted compressed file");
  540. }
  541.  
  542. ///
  543. /// @brief Prints usage information and a custom error message.
  544. /// @param s    custom error message to be printed
  545. /// @param su   Show Usage information
  546. ///
  547. void print_usage(const std::string &s = "", bool su = true)
  548. {
  549.     if (!s.empty())
  550.         std::cerr << "\nERROR: " << s << '\n';
  551.  
  552.     if (su)
  553.     {
  554.         std::cerr << "\nUsage:\n";
  555.         std::cerr << "\tprogram -flag input_file output_file\n\n";
  556.         std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
  557.         std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
  558.         std::cerr << "Examples:\n";
  559.         std::cerr << "\tlzw_v6.exe -c license.txt license.lzw\n";
  560.         std::cerr << "\tlzw_v6.exe -d license.lzw new_license.txt\n";
  561.     }
  562.  
  563.     std::cerr << std::endl;
  564. }
  565.  
  566. ///
  567. /// @brief Actual program entry point.
  568. /// @param argc             number of command line arguments
  569. /// @param [in] argv        array of command line arguments
  570. /// @retval EXIT_FAILURE    for failed operation
  571. /// @retval EXIT_SUCCESS    for successful operation
  572. ///
  573. int main(int argc, char *argv[])
  574. {
  575.     if (argc != 4)
  576.     {
  577.         print_usage("Wrong number of arguments.");
  578.         return EXIT_FAILURE;
  579.     }
  580.  
  581.     enum class Mode {
  582.         Compress,
  583.         Decompress
  584.     };
  585.  
  586.     Mode m;
  587.  
  588.     if (std::string(argv[1]) == "-c")
  589.         m = Mode::Compress;
  590.     else
  591.     if (std::string(argv[1]) == "-d")
  592.         m = Mode::Decompress;
  593.     else
  594.     {
  595.         print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
  596.         return EXIT_FAILURE;
  597.     }
  598.  
  599.     const std::size_t buffer_size {1024 * 1024};
  600.  
  601.     // these custom buffers should be larger than the default ones
  602.     const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
  603.     const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
  604.  
  605.     std::ifstream input_file;
  606.     std::ofstream output_file;
  607.  
  608.     input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
  609.     input_file.open(argv[2], std::ios_base::binary);
  610.  
  611.     if (!input_file.is_open())
  612.     {
  613.         print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
  614.         return EXIT_FAILURE;
  615.     }
  616.  
  617.     output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
  618.     output_file.open(argv[3], std::ios_base::binary);
  619.  
  620.     if (!output_file.is_open())
  621.     {
  622.         print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
  623.         return EXIT_FAILURE;
  624.     }
  625.  
  626.     try
  627.     {
  628.         input_file.exceptions(std::ios_base::badbit);
  629.         output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
  630.  
  631.         if (m == Mode::Compress)
  632.             compress(input_file, output_file);
  633.         else
  634.         if (m == Mode::Decompress)
  635.             decompress(input_file, output_file);
  636.     }
  637.     catch (const std::ios_base::failure &f)
  638.     {
  639.         print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
  640.         return EXIT_FAILURE;
  641.     }
  642.     catch (const std::exception &e)
  643.     {
  644.         print_usage(std::string("Caught exception: ") + e.what() + '.', false);
  645.         return EXIT_FAILURE;
  646.     }
  647.  
  648.     return EXIT_SUCCESS;
  649. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement