Advertisement
jules0707

Huffman

Feb 27th, 2017
614
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 11.66 KB | None | 0 0
  1. package patmat
  2.  
  3. import common._
  4. import scala.collection.parallel.RemainsIterator
  5.  
  6. /**
  7.  * Assignment 4: Huffman coding
  8.  *
  9.  */
  10. object Huffman {
  11.  
  /**
   * A huffman code is represented by a binary tree.
   *
   * Every `Leaf` node of the tree represents one character of the alphabet that the
   * tree can encode; its weight is the frequency of appearance of that character.
   *
   * The branches of the tree, the `Fork` nodes, carry the list of all characters
   * present in the leaves below them and a weight equal to the sum of the weights
   * of those leaves.
   *
   * The hierarchy is `sealed` so that the compiler can check pattern matches over
   * `CodeTree` for exhaustiveness; `Fork` and `Leaf` are the only possible shapes.
   */
  sealed abstract class CodeTree

  /** Inner node: two sub-trees plus the characters and total weight found beneath them. */
  final case class Fork(left: CodeTree, right: CodeTree, chars: List[Char], weight: Int) extends CodeTree

  /** Terminal node: one character together with its frequency. */
  final case class Leaf(char: Char, weight: Int) extends CodeTree
  25.  
  26.   // Part 1: Basics
  /**
   * Returns the weight stored in the root node of `tree`.
   *
   * @param tree a `Leaf` or a `Fork`
   * @return the node's own `weight` field (no traversal of sub-trees takes place)
   */
  def weight(tree: CodeTree): Int = tree match {
    case fork: Fork => fork.weight
    case leaf: Leaf => leaf.weight
  }
  31.  
  /**
   * Returns the characters represented by the root node of `tree`.
   *
   * @param tree a `Leaf` or a `Fork`
   * @return a one-element list for a `Leaf`, the stored `chars` list for a `Fork`
   */
  def chars(tree: CodeTree): List[Char] = tree match {
    case fork: Fork => fork.chars
    case leaf: Leaf => leaf.char :: Nil
  }
  36.  
  /**
   * Joins two code trees under a new `Fork`.
   *
   * @param left  the sub-tree reached with bit 0
   * @param right the sub-tree reached with bit 1
   * @return a `Fork` whose character list is `chars(left)` followed by `chars(right)`
   *         and whose weight is the sum of both weights
   */
  def makeCodeTree(left: CodeTree, right: CodeTree): Fork = {
    val combinedChars = chars(left) ::: chars(right)
    val combinedWeight = weight(left) + weight(right)
    Fork(left, right, combinedChars, combinedWeight)
  }
  39.  
  40.   // Part 2: Generating Huffman trees
  41.  
  /**
   * Turns a string into the list of its characters, preserving their order.
   * The rest of this object works on character lists rather than on strings.
   *
   * @param str the text to split
   * @return the characters of `str`, first character first
   */
  def string2Chars(str: String): List[Char] = str.toCharArray.toList
  47.  
  /**
   * Computes, for each distinct character in `chars`, the number of times it occurs.
   * For example
   *
   *   times(List('a', 'b', 'a'))
   *
   * returns
   *
   *   List(('a', 2), ('b', 1))
   *
   * Each result element is a pair `(character, count)`. Pairs can be taken apart with
   * the accessors `_1` and `_2` or with a pattern such as `case (theChar, theInt) =>`.
   *
   * Callers should not rely on the order of the result. As implemented, the character
   * seen most recently in `chars` comes first.
   *
   * @param chars the text to analyse; may be empty
   * @return one `(character, count)` pair per distinct character, `Nil` for empty input
   */
  def times(chars: List[Char]): List[(Char, Int)] =
    chars.foldLeft(List.empty[(Char, Int)]) { (counts, c) =>
      // An absent character is modelled as a previous count of 0 instead of a null pair.
      val seenSoFar = counts.find(_._1 == c).fold(0)(_._2)
      // Move the updated pair to the front and drop the stale one, if any.
      (c, seenSoFar + 1) :: counts.filterNot(_._1 == c)
    }
  97.  
  /**
   * Builds one `Leaf` per entry of the frequency table `freqs` and returns the leaves
   * ordered by ascending weight, so the head of the result is the lightest leaf.
   *
   * @param freqs pairs of `(character, frequency)`
   * @return the corresponding leaves, lightest first
   */
  def makeOrderedLeafList(freqs: List[(Char, Int)]): List[Leaf] = {
    // Prepending while folding reverses the input; the stable sort below therefore
    // leaves leaves of equal weight in reverse input order.
    val unsorted = freqs.foldLeft(List.empty[Leaf]) {
      case (acc, (symbol, count)) => Leaf(symbol, count) :: acc
    }
    unsorted.sortBy(_.weight)
  }
  115.  
  /**
   * Tells whether `trees` holds exactly one code tree.
   *
   * @param trees the list to inspect
   * @return `true` for a one-element list, `false` for an empty or longer list
   */
  def singleton(trees: List[CodeTree]): Boolean =
    trees.lengthCompare(1) == 0
  120.  
  /**
   * Merges the first two trees of `trees` into a single `Fork` and puts the result
   * back so that the list is ordered by ascending weight again.
   *
   * `trees` is expected to be ordered by ascending weight. The merged list is re-sorted
   * as a whole with a stable sort, so the new fork ends up in front of any tree of
   * equal weight.
   *
   * @param trees code trees, lightest first
   * @return the list with its two lightest trees combined; `trees` itself, unchanged,
   *         when it has fewer than two elements
   */
  def combine(trees: List[CodeTree]): List[CodeTree] = trees match {
    case lightest :: secondLightest :: remaining =>
      val merged = makeCodeTree(lightest, secondLightest)
      (merged :: remaining).sortBy(tree => weight(tree))
    case _ =>
      trees
  }
  138.  
  /**
   * Repeatedly applies `q` to `trees` until `p` holds, then returns that list.
   *
   * Intended to be called as
   *
   *   until(singleton, combine)(trees)
   *
   * which keeps combining the two lightest trees until a single tree is left.
   *
   * @param p     stop condition, checked before every application of `q`
   * @param q     reduction step applied while `p` is false
   * @param trees the starting list
   * @return the first list in the sequence `trees, q(trees), q(q(trees)), ...` that
   *         satisfies `p`; the call does not return if no such list is ever produced
   */
  def until(p: List[CodeTree] => Boolean, q: List[CodeTree] => List[CodeTree])(trees: List[CodeTree]): List[CodeTree] = {
    @scala.annotation.tailrec
    def loop(current: List[CodeTree]): List[CodeTree] =
      if (!p(current)) loop(q(current))
      else current

    loop(trees)
  }
  159.  
  /**
   * Creates a code tree for encoding the text `chars`.
   *
   * The character frequencies are extracted from the text with `times`, turned into a
   * weight-ordered leaf list, and then combined pairwise until one tree remains.
   *
   * @param chars an arbitrary, non-empty text
   * @return the root of the resulting code tree
   * @throws IllegalArgumentException if `chars` is empty: an empty leaf list can never
   *                                  be reduced to a single tree, so the reduction
   *                                  would otherwise never terminate
   */
  def createCodeTree(chars: List[Char]): CodeTree = {
    require(chars.nonEmpty, "cannot create a code tree from an empty text")
    val leaves = makeOrderedLeafList(times(chars))
    // `until` only returns once `singleton` holds, so `head` is safe here.
    until(singleton, combine)(leaves).head
  }
  168.  
  169.   // Part 3: Decoding
  170.  
  /**
   * A single bit of an encoded message, stored as an `Int`.
   * `encode` emits 0 for a left branch and 1 for a right branch; `decode` follows the
   * left branch for 0 and the right branch for any other value.
   */
  type Bit = Int
  172.  
  /**
   * Decodes the bit sequence `bits` using the code tree `tree` and returns the
   * resulting list of characters.
   *
   * Starting at the root, every bit selects a branch (0 goes left, any other value goes
   * right). Reaching a `Leaf` emits its character, and decoding of the next code word
   * restarts at the root of `tree`.
   *
   * @param tree the complete code tree; every code word is resolved from its root
   * @param bits the encoded message
   * @return the decoded characters in message order. An empty `bits` yields `Nil` when
   *         `tree` is a `Fork`, and the single character when `tree` is a lone `Leaf`.
   * @throws IllegalArgumentException if `tree` is a lone `Leaf` and `bits` is non-empty;
   *                                  such a tree consumes no bits, so the input could
   *                                  never be used up
   * @throws NoSuchElementException   if `bits` ends in the middle of a code word
   */
  def decode(tree: CodeTree, bits: List[Bit]): List[Char] = {
    // `tree` always refers to the root here, so the walk can restart from the top
    // after each decoded character. Characters are accumulated in reverse order.
    @scala.annotation.tailrec
    def walk(node: CodeTree, remaining: List[Bit], decoded: List[Char]): List[Char] =
      node match {
        case Leaf(c, _) =>
          if (remaining.isEmpty) (c :: decoded).reverse
          else walk(tree, remaining, c :: decoded)
        case Fork(left, right, _, _) =>
          remaining match {
            case 0 :: rest => walk(left, rest, decoded)
            case _ :: rest => walk(right, rest, decoded)
            case Nil =>
              throw new NoSuchElementException("bit sequence ends in the middle of a code word")
          }
      }

    tree match {
      case Leaf(c, _) =>
        require(bits.isEmpty, "a single-leaf code tree cannot decode a non-empty bit sequence")
        List(c)
      case _: Fork =>
        if (bits.isEmpty) Nil else walk(tree, bits, Nil)
    }
  }
  /**
   * A Huffman coding tree for the French language, written out as a literal
   * `Fork`/`Leaf` expression. It is the tree `decodedSecret` passes to `decode`.
   *
   * Generated from the data given at
   *   http://fr.wikipedia.org/wiki/Fr%C3%A9quence_d%27apparition_des_lettres_en_fran%C3%A7ais
   */
  val frenchCode: CodeTree = Fork(Fork(Fork(Leaf('s', 121895), Fork(Leaf('d', 56269), Fork(Fork(Fork(Leaf('x', 5928), Leaf('j', 8351), List('x', 'j'), 14279), Leaf('f', 16351), List('x', 'j', 'f'), 30630), Fork(Fork(Fork(Fork(Leaf('z', 2093), Fork(Leaf('k', 745), Leaf('w', 1747), List('k', 'w'), 2492), List('z', 'k', 'w'), 4585), Leaf('y', 4725), List('z', 'k', 'w', 'y'), 9310), Leaf('h', 11298), List('z', 'k', 'w', 'y', 'h'), 20608), Leaf('q', 20889), List('z', 'k', 'w', 'y', 'h', 'q'), 41497), List('x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q'), 72127), List('d', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q'), 128396), List('s', 'd', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q'), 250291), Fork(Fork(Leaf('o', 82762), Leaf('l', 83668), List('o', 'l'), 166430), Fork(Fork(Leaf('m', 45521), Leaf('p', 46335), List('m', 'p'), 91856), Leaf('u', 96785), List('m', 'p', 'u'), 188641), List('o', 'l', 'm', 'p', 'u'), 355071), List('s', 'd', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q', 'o', 'l', 'm', 'p', 'u'), 605362), Fork(Fork(Fork(Leaf('r', 100500), Fork(Leaf('c', 50003), Fork(Leaf('v', 24975), Fork(Leaf('g', 13288), Leaf('b', 13822), List('g', 'b'), 27110), List('v', 'g', 'b'), 52085), List('c', 'v', 'g', 'b'), 102088), List('r', 'c', 'v', 'g', 'b'), 202588), Fork(Leaf('n', 108812), Leaf('t', 111103), List('n', 't'), 219915), List('r', 'c', 'v', 'g', 'b', 'n', 't'), 422503), Fork(Leaf('e', 225947), Fork(Leaf('i', 115465), Leaf('a', 117110), List('i', 'a'), 232575), List('e', 'i', 'a'), 458522), List('r', 'c', 'v', 'g', 'b', 'n', 't', 'e', 'i', 'a'), 881025), List('s', 'd', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q', 'o', 'l', 'm', 'p', 'u', 'r', 'c', 'v', 'g', 'b', 'n', 't', 'e', 'i', 'a'), 1486387)
  206.  
  /**
   * An encoded secret message. What does it say? Can you decode it?
   * For the decoding use the `frenchCode` Huffman tree; `decodedSecret` does exactly
   * that.
   */
  val secret: List[Bit] = List(0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1)
  212.  
  /**
   * Returns the decoded secret: the bits of `secret` decoded with the `frenchCode` tree.
   *
   * @return the characters of the secret message
   */
  def decodedSecret: List[Char] =
    decode(tree = frenchCode, bits = secret)
  217.  
  218.   // Part 4a: Encoding using Huffman tree
  219.  
  /**
   * Encodes `text` into a sequence of bits using the code tree `tree`.
   *
   * For every character the tree is descended from the root towards the leaf holding
   * that character, emitting 0 for each step to the left and 1 for each step to the
   * right. A tree consisting of a single `Leaf` therefore encodes its character as an
   * empty bit sequence.
   *
   * @param tree the code tree to encode with
   * @param text the characters to encode
   * @return the concatenated code words of all characters of `text`, in order
   * @throws NoSuchElementException if `text` contains a character that does not occur
   *                                in `tree`
   */
  def encode(tree: CodeTree)(text: List[Char]): List[Bit] = {
    // Raised for characters without a code word instead of an anonymous match failure.
    def notEncodable(c: Char): Nothing =
      throw new NoSuchElementException(s"character '$c' is not present in the code tree")

    def lookup(node: CodeTree, c: Char): List[Bit] = node match {
      case Leaf(symbol, _) =>
        if (symbol == c) Nil else notEncodable(c)
      case Fork(left, right, _, _) =>
        if (chars(left).contains(c)) 0 :: lookup(left, c)
        else if (chars(right).contains(c)) 1 :: lookup(right, c)
        else notEncodable(c)
    }

    text.flatMap(c => lookup(tree, c))
  }
  233.  
  234.   // Part 4b: Encoding using code table
  235.  
  /** One code-table entry: a character together with the bits that encode it. */
  type Code = (Char, List[Bit])

  /** A code table: one `Code` entry per encodable character. */
  type CodeTable = List[Code]
  238.  
  /**
   * Returns the bit sequence that represents the character `char` in the code table
   * `table`.
   *
   * @param table the code table to search
   * @param char  the character to look up
   * @return the bits of the first entry of `table` whose character equals `char`
   * @throws NoSuchElementException if `table` has no entry for `char`
   */
  def codeBits(table: CodeTable)(char: Char): List[Bit] =
    table
      .find(_._1 == char) // stops at the first matching entry
      .map(_._2)
      .getOrElse(throw new NoSuchElementException(s"no code for character '$char' in the code table"))
  245.  
  /**
   * Creates, for a given code tree, the code table that contains for every character
   * of the tree the sequence of bits representing it.
   *
   * The table is built recursively: a `Leaf` yields a single entry with an empty bit
   * list, and a `Fork` merges the tables of its two sub-trees with `mergeCodeTables`,
   * which adds the bit for the branch taken.
   *
   * @param tree the code tree to flatten
   * @return one entry per leaf, entries of the left sub-tree before those of the right
   */
  def convert(tree: CodeTree): CodeTable = tree match {
    case Fork(zeroBranch, oneBranch, _, _) =>
      val zeroTable = convert(zeroBranch)
      val oneTable = convert(oneBranch)
      mergeCodeTables(zeroTable, oneTable)
    case Leaf(symbol, _) =>
      List(symbol -> List.empty[Bit])
  }
  259.  
  /**
   * Merges two code tables into one, treating `a` as the table of a left sub-tree and
   * `b` as the table of a right sub-tree: every code of `a` is prefixed with 0 and
   * every code of `b` with 1.
   *
   * @param a code table of the left sub-tree
   * @param b code table of the right sub-tree
   * @return the prefixed entries of `a` followed by the prefixed entries of `b`
   */
  def mergeCodeTables(a: CodeTable, b: CodeTable): CodeTable = {
    val leftCodes = a.map { case (symbol, bits) => (symbol, 0 :: bits) }
    val rightCodes = b.map { case (symbol, bits) => (symbol, 1 :: bits) }
    leftCodes ::: rightCodes
  }
  271.  
  /**
   * Encodes `text` according to the code tree `tree`.
   *
   * To speed up the encoding process, the tree is first converted to a code table,
   * which is then used to look up the bits of every character.
   *
   * @param tree the code tree to encode with
   * @param text the characters to encode
   * @return the concatenated code words of all characters of `text`, in order
   */
  def quickEncode(tree: CodeTree)(text: List[Char]): List[Bit] = {
    // Build the table once, up front, and reuse it for every character.
    val table = convert(tree)
    text.flatMap(symbol => codeBits(table)(symbol))
  }
  280. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement