1. ///
  2. /// @file
  3. /// @author Julius Pettersson
  4. /// @copyright MIT/Expat License.
  5. /// @brief LZW file archiver
  6. /// @version 5
  7. /// @remark This version borrows heavily from Juha Nieminen's work.
  8. ///
  9. /// This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line archiver.
  10. /// It uses the simpler fixed-width code compression method.
  11. /// It was written with Doxygen comments.
  12. ///
  13. /// http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
  14. /// http://marknelson.us/2011/11/08/lzw-revisited/
  15. /// http://www.cs.duke.edu/csed/curious/compression/lzw.html
  16. /// http://warp.povusers.org/EfficientLZW/index.html
  17. /// http://en.cppreference.com/
  18. /// http://www.doxygen.org/
  19. ///
  20.  
  21. #include <algorithm>
  22. #include <array>
  23. #include <climits>
  24. #include <cstddef>
  25. #include <cstdint>
  26. #include <cstdlib>
  27. #include <exception>
  28. #include <fstream>
  29. #include <ios>
  30. #include <iostream>
  31. #include <istream>
  32. #include <limits>
  33. #include <memory>
  34. #include <ostream>
  35. #include <stdexcept>
  36. #include <string>
  37. #include <utility>
  38. #include <vector>
  39.  
  40. namespace {
  41.  
  42. using CodeType = std::uint16_t; ///< Type used to store and retrieve codes.
  43.  
  44. namespace globals {
  45.  
  46. /// Dictionary Maximum Size (when reached, the dictionary will be reset)
  47. const CodeType dms {std::numeric_limits<CodeType>::max()};
  48.  
  49. } // namespace globals
  50.  
  51. ///
  52. /// @brief Compresses the contents of `is` and writes the result to `os`.
  53. /// @param [in] is      input stream
  54. /// @param [out] os     output stream
  55. ///
  56. void compress(std::istream &is, std::ostream &os)
  57. {
  58.     class EncoderDictionary {
  59.  
  60.         struct Node {
  61.             CodeType    prefix; ///< Code of string to which the byte `c` is appended.
  62.             CodeType    first;  ///< Code of first child string.
  63.             char        c;      ///< Byte.
  64.             CodeType    left;   ///< Code of child string with byte < `c`.
  65.             CodeType    right;  ///< Code of child string with byte > `c`.
  66.  
  67.             explicit Node(char c, CodeType prefix = globals::dms):
  68.                 prefix(prefix),
  69.                 first(globals::dms),
  70.                 c(c),
  71.                 left(globals::dms),
  72.                 right(globals::dms)
  73.             {
  74.             }
  75.         };
  76.  
  77.     public:
  78.  
  79.         EncoderDictionary()
  80.         {
  81.             const int minc = std::numeric_limits<char>::min();
  82.             const int maxc = std::numeric_limits<char>::max();
  83.             CodeType k {0};
  84.  
  85.             for (int c = minc; c <= maxc; ++c)
  86.                 initials[static_cast<unsigned char> (c)] = k++;
  87.  
  88.             reset();
  89.         }
  90.  
  91.         ///
  92.         /// @brief Resets dictionary to its initial contents.
  93.         ///
  94.         void reset()
  95.         {
  96.             vn.clear();
  97.             vn.reserve(globals::dms);
  98.  
  99.             const int minc = std::numeric_limits<char>::min();
  100.             const int maxc = std::numeric_limits<char>::max();
  101.  
  102.             for (int c = minc; c <= maxc; ++c)
  103.                 vn.push_back(Node(c));
  104.         }
  105.  
  106.         ///
  107.         /// @brief Searches for a pair (`i`, `c`) and inserts the pair if it wasn't found.
  108.         /// @param i    code to search for
  109.         /// @param c    attached byte to search for
  110.         /// @return The index of the pair, if it was found.
  111.         /// @retval globals::dms    if the pair wasn't found
  112.         ///
  113.         CodeType search_and_insert(CodeType i, char c)
  114.         {
  115.             // dictionary's maximum size was reached
  116.             if (vn.size() == globals::dms)
  117.                 reset();
  118.  
  119.             if (i == globals::dms)
  120.                 return search_initials(c);
  121.  
  122.             const CodeType vn_size = vn.size();
  123.             CodeType ci {vn[i].first}; // Current Index
  124.  
  125.             if (ci != globals::dms)
  126.                 while (true)
  127.                     if (c < vn[ci].c)
  128.                     {
  129.                         if (vn[ci].left == globals::dms)
  130.                         {
  131.                             vn[ci].left = vn_size;
  132.                             break;
  133.                         }
  134.                         else
  135.                             ci = vn[ci].left;
  136.                     }
  137.                     else
  138.                     if (c > vn[ci].c)
  139.                     {
  140.                         if (vn[ci].right == globals::dms)
  141.                         {
  142.                             vn[ci].right = vn_size;
  143.                             break;
  144.                         }
  145.                         else
  146.                             ci = vn[ci].right;
  147.                     }
  148.                     else // c == vn[ci].c
  149.                         return ci;
  150.             else
  151.                 vn[i].first = vn_size;
  152.  
  153.             vn.push_back(Node(c, i));
  154.             return globals::dms;
  155.         }
  156.  
  157.         ///
  158.         /// @brief Fakes a search for byte `c` in the one-byte area of the dictionary.
  159.         /// @param c    byte to search for
  160.         /// @return The code associated to the searched byte.
  161.         ///
  162.         CodeType search_initials(char c) const
  163.         {
  164.             return initials[static_cast<unsigned char> (c)];
  165.         }
  166.  
  167.     private:
  168.  
  169.         std::vector<Node> vn;
  170.  
  171.         /// Cheat sheet for mapping one-byte strings to their codes.
  172.         std::array<CodeType, 1u << CHAR_BIT> initials;
  173.     };
  174.  
  175.     EncoderDictionary ed;
  176.     CodeType i {globals::dms}; // Index
  177.     char c;
  178.  
  179.     while (is.get(c))
  180.     {
  181.         const CodeType temp {i};
  182.  
  183.         if ((i = ed.search_and_insert(temp, c)) == globals::dms)
  184.         {
  185.             os.write(reinterpret_cast<const char *> (&temp), sizeof (CodeType));
  186.             i = ed.search_initials(c);
  187.         }
  188.     }
  189.  
  190.     if (i != globals::dms)
  191.         os.write(reinterpret_cast<const char *> (&i), sizeof (CodeType));
  192. }
  193.  
  194. ///
  195. /// @brief Decompresses the contents of `is` and writes the result to `os`.
  196. /// @param [in] is      input stream
  197. /// @param [out] os     output stream
  198. ///
  199. void decompress(std::istream &is, std::ostream &os)
  200. {
  201.     std::vector<std::pair<CodeType, char>> dictionary;
  202.  
  203.     // "named" lambda function, used to reset the dictionary to its initial contents
  204.     const auto reset_dictionary = [&dictionary] {
  205.         dictionary.clear();
  206.         dictionary.reserve(globals::dms);
  207.  
  208.         const int minc = std::numeric_limits<char>::min();
  209.         const int maxc = std::numeric_limits<char>::max();
  210.  
  211.         for (int c = minc; c <= maxc; ++c)
  212.             dictionary.push_back({globals::dms, static_cast<char> (c)});
  213.     };
  214.  
  215.     const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
  216.         static std::vector<char> s; // String
  217.  
  218.         s.clear();
  219.  
  220.         // the length of a string cannot exceed the dictionary's number of entries
  221.         s.reserve(globals::dms);
  222.  
  223.         while (k != globals::dms)
  224.         {
  225.             s.push_back(dictionary.at(k).second);
  226.             k = dictionary.at(k).first;
  227.         }
  228.  
  229.         std::reverse(s.begin(), s.end());
  230.         return &s;
  231.     };
  232.  
  233.     reset_dictionary();
  234.  
  235.     CodeType i {globals::dms}; // Index
  236.     CodeType k; // Key
  237.  
  238.     while (is.read(reinterpret_cast<char *> (&k), sizeof (CodeType)))
  239.     {
  240.         // dictionary's maximum size was reached
  241.         if (dictionary.size() == globals::dms)
  242.             reset_dictionary();
  243.  
  244.         if (k > dictionary.size())
  245.             throw std::runtime_error("invalid compressed code");
  246.  
  247.         const std::vector<char> *s; // String
  248.  
  249.         if (k == dictionary.size())
  250.         {
  251.             dictionary.push_back({i, rebuild_string(i)->front()});
  252.             s = rebuild_string(k);
  253.         }
  254.         else
  255.         {
  256.             s = rebuild_string(k);
  257.  
  258.             if (i != globals::dms)
  259.                 dictionary.push_back({i, s->front()});
  260.         }
  261.  
  262.         os.write(&s->front(), s->size());
  263.         i = k;
  264.     }
  265.  
  266.     if (!is.eof() || is.gcount() != 0)
  267.         throw std::runtime_error("corrupted compressed file");
  268. }
  269.  
  270. ///
  271. /// @brief Prints usage information and a custom error message.
  272. /// @param s    custom error message to be printed
  273. /// @param su   Show Usage information
  274. ///
  275. void print_usage(const std::string &s = "", bool su = true)
  276. {
  277.     if (!s.empty())
  278.         std::cerr << "\nERROR: " << s << '\n';
  279.  
  280.     if (su)
  281.     {
  282.         std::cerr << "\nUsage:\n";
  283.         std::cerr << "\tprogram -flag input_file output_file\n\n";
  284.         std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
  285.         std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
  286.         std::cerr << "Examples:\n";
  287.         std::cerr << "\tlzw_v5.exe -c license.txt license.lzw\n";
  288.         std::cerr << "\tlzw_v5.exe -d license.lzw new_license.txt\n";
  289.     }
  290.  
  291.     std::cerr << std::endl;
  292. }
  293.  
  294. } // namespace
  295.  
  296. ///
  297. /// @brief Actual program entry point.
  298. /// @param argc             number of command line arguments
  299. /// @param [in] argv        array of command line arguments
  300. /// @retval EXIT_FAILURE    for failed operation
  301. /// @retval EXIT_SUCCESS    for successful operation
  302. ///
  303. int main(int argc, char *argv[])
  304. {
  305.     if (argc != 4)
  306.     {
  307.         print_usage("Wrong number of arguments.");
  308.         return EXIT_FAILURE;
  309.     }
  310.  
  311.     enum class Mode {
  312.         Compress,
  313.         Decompress
  314.     };
  315.  
  316.     Mode m;
  317.  
  318.     if (std::string(argv[1]) == "-c")
  319.         m = Mode::Compress;
  320.     else
  321.     if (std::string(argv[1]) == "-d")
  322.         m = Mode::Decompress;
  323.     else
  324.     {
  325.         print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
  326.         return EXIT_FAILURE;
  327.     }
  328.  
  329.     const std::size_t buffer_size {1024 * 1024};
  330.  
  331.     // these custom buffers should be larger than the default ones
  332.     const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
  333.     const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
  334.  
  335.     std::ifstream input_file;
  336.     std::ofstream output_file;
  337.  
  338.     input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
  339.     input_file.open(argv[2], std::ios_base::binary);
  340.  
  341.     if (!input_file.is_open())
  342.     {
  343.         print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
  344.         return EXIT_FAILURE;
  345.     }
  346.  
  347.     output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
  348.     output_file.open(argv[3], std::ios_base::binary);
  349.  
  350.     if (!output_file.is_open())
  351.     {
  352.         print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
  353.         return EXIT_FAILURE;
  354.     }
  355.  
  356.     try
  357.     {
  358.         input_file.exceptions(std::ios_base::badbit);
  359.         output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
  360.  
  361.         if (m == Mode::Compress)
  362.             compress(input_file, output_file);
  363.         else
  364.         if (m == Mode::Decompress)
  365.             decompress(input_file, output_file);
  366.     }
  367.     catch (const std::ios_base::failure &f)
  368.     {
  369.         print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
  370.         return EXIT_FAILURE;
  371.     }
  372.     catch (const std::exception &e)
  373.     {
  374.         print_usage(std::string("Caught exception: ") + e.what() + '.', false);
  375.         return EXIT_FAILURE;
  376.     }
  377.  
  378.     return EXIT_SUCCESS;
  379. }