Advertisement
timothy235

sicp-2-3-4-huffman-encoding-trees

Feb 25th, 2016
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Racket 9.44 KB | None | 0 0
  1. #lang racket
  2.  
  3. (define (make-leaf symbol weight)
  4.   (list 'leaf symbol weight))
  5. (define (leaf? object)
  6.   (eq? (first object) 'leaf))
  7. (define (symbol-leaf x) (second x))
  8. (define (weight-leaf x) (third x))
  9.  
  10. (define (make-code-tree left right)
  11.   (list left
  12.         right
  13.         (append (symbols left) (symbols right))
  14.         (+ (weight left) (weight right))))
  15.  
  16. (define (left-branch tree) (first tree))
  17. (define (right-branch tree) (second tree))
  18. (define (symbols tree)
  19.   (if (leaf? tree)
  20.     (list (symbol-leaf tree))
  21.     (third tree)))
  22. (define (weight tree)
  23.   (if (leaf? tree)
  24.     (weight-leaf tree)
  25.     (fourth tree)))
  26.  
  27. (define (decode bits tree)
  28.   (define (decode-1 bits current-branch)
  29.     (cond [(empty? bits) empty]
  30.           [else
  31.             (define next-branch (choose-branch (first bits) current-branch))
  32.             (cond [(leaf? next-branch)
  33.                    (cons (symbol-leaf next-branch)
  34.                          (decode-1 (rest bits) tree))]
  35.                   [else (decode-1 (rest bits) next-branch)])]))
  36.   (decode-1 bits tree))
  37. (define (choose-branch bit branch)
  38.   (cond [(zero? bit) (left-branch branch)]
  39.         [(= bit 1) (right-branch branch)]
  40.         [else (error "bad bit -- CHOOSE-BRANCH" bit)]))
  41.  
  42. ;; represent a set of leaves and trees as an ordered list, ordered by weight
  43.  
  44. (define (adjoin-set x st)
  45.   (cond [(empty? st) (list x)]
  46.         [(< (weight x) (weight (first st))) (cons x st)]
  47.         [else (cons (first st)
  48.                     (adjoin-set x (rest st)))]))
  49.  
  50. (define (make-leaf-set pairs)
  51.   (cond [(empty? pairs) empty]
  52.         [else
  53.           (define pair (first pairs))
  54.           (adjoin-set (make-leaf (first pair) (second pair))
  55.                       (make-leaf-set (rest pairs)))]))
  56.  
  57. ;;;;;;;;;;
  58. ;; 2.67 ;;
  59. ;;;;;;;;;;
  60.  
  61. (define sample-tree
  62.   (make-code-tree (make-leaf 'A 4)
  63.                   (make-code-tree (make-leaf 'B 2)
  64.                                   (make-code-tree (make-leaf 'D 1)
  65.                                                   (make-leaf 'C 1)))))
  66. (define sample-message '(0 1 1 0 0 1 0 1 0 1 1 1 0))
  67.  
  68. (decode sample-message sample-tree)
  69. ;; '(A D A B B C A)
  70.  
  71. ;;;;;;;;;;
  72. ;; 2.68 ;;
  73. ;;;;;;;;;;
  74.  
  75. (define (encode message tree)
  76.   (if (empty? message)
  77.     empty
  78.     (append (encode-symbol (first message) tree)
  79.             (encode (rest message) tree))))
  80. (define (encode-symbol symbol tree)
  81.   (define (loop code tree)
  82.     (cond [(leaf? tree) (reverse code)]
  83.           [(member symbol (symbols (left-branch tree)))
  84.            (loop (cons '0 code) (left-branch tree))]
  85.           [(member symbol (symbols (right-branch tree)))
  86.            (loop (cons '1 code) (right-branch tree))]
  87.           [else (error "symbol not in tree -- ENCODE-SYMBOL" symbol)]))
  88.   (loop empty tree))
  89.  
  90. (encode '(A D A B B C A) sample-tree)
  91. ;; '(0 1 1 0 0 1 0 1 0 1 1 1 0)
  92.  
  93. (equal? sample-message
  94.         (encode (decode sample-message
  95.                         sample-tree)
  96.                 sample-tree))
  97. ;; #t
  98.  
  99. ;;;;;;;;;;
  100. ;; 2.69 ;;
  101. ;;;;;;;;;;
  102.  
  103. (define (generate-huffman-tree pairs)
  104.   (successive-merge (make-leaf-set pairs)))
  105. (define (successive-merge leaf-set) ; can assume leaf-set sorted
  106.   (if (= (length leaf-set) 2)
  107.     (make-code-tree (first leaf-set) (second leaf-set))
  108.     (successive-merge
  109.       (adjoin-set (make-code-tree (first leaf-set) (second leaf-set))
  110.                   (rest (rest leaf-set))))))
  111.  
  112. ;; Note that the pairs passed to generate-huffman-tree do not have to be sorted.
  113. ;; The pairs get sorted by make-leaf-set which uses adjoin-set.
  114.  
  115. (define sample-pairs '((A 4)
  116.                        (B 2)
  117.                        (C 1)
  118.                        (D 1)))
  119.  
  120. (generate-huffman-tree sample-pairs)
  121. ;; '((leaf A 4)
  122.   ;; ((leaf B 2) ((leaf D 1) (leaf C 1) (D C) 2) (B D C) 4)
  123.   ;; (A B D C)
  124.   ;; 8)
  125.  
  126. ;;;;;;;;;;
  127. ;; 2.70 ;;
  128. ;;;;;;;;;;
  129.  
  130. (define rock-pairs '((get 2)
  131.                      (a 2)
  132.                      (job 2)
  133.                      (sha 3)
  134.                      (na 16)
  135.                      (wah 1)
  136.                      (yip 9)
  137.                      (boom 1)))
  138.  
  139. (define rock-tree (generate-huffman-tree rock-pairs))
  140. (define rock-song '(get a job sha na na na na na na na na
  141.                         get a job sha na na na na na na na na
  142.                         wah yip yip yip yip yip yip yip yip yip
  143.                         sha boom))
  144. (length rock-song)
  145. ;; 36
  146.  
  147. (encode '(get) rock-tree)
  148. ;; '(1 1 0 0)
  149. (encode '(a) rock-tree)
  150. ;; '(1 1 1 1 1)
  151. (encode '(job) rock-tree)
  152. ;; '(1 1 1 1 0)
  153. (encode '(sha) rock-tree)
  154. ;; '(1 1 1 0)
  155. (encode '(na) rock-tree)
  156. ;; '(0)
  157. (encode '(wah) rock-tree)
  158. ;; '(1 1 0 1 1)
  159. (encode '(yip) rock-tree)
  160. ;; '(1 0)
  161. (encode '(boom) rock-tree)
  162. ;; '(1 1 0 1 0)
  163.  
  164. (define rock-code (encode rock-song rock-tree))
  165. (length rock-code)
  166. ;; 84
  167.  
  168. ;; rock-code
  169. ;; ;; '(1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0
  170.   ;; ;; 0 0 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1
  171.   ;; ;; 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 1 0 1 0
  172.   ;; ;; 1 0 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 0 1 0)
  173.  
  174. (decode rock-code rock-tree)
  175. ;; '(get a job
  176.       ;; sha na na na na na na na na
  177.       ;; get a job
  178.       ;; sha na na na na na na na na
  179.       ;; wah yip yip yip yip yip yip yip yip yip
  180.       ;; sha boom)
  181.  
  182. ;; There are eight symbols in the rock alphabet.  So a fixed-length code would
  183. ;; need three bits per word to encode all eight symbols.  So the rock song would
  184. ;; require a code of 36 * 3 = 108 bits.  The huffman code is only 84 bits which
  185. ;; is a savings of 24 bits or almost 29%.
  186.  
  187. ;;;;;;;;;;
  188. ;; 2.71 ;;
  189. ;;;;;;;;;;
  190.  
  191. (define five-powers-of-two-pairs '((a 16)
  192.                                    (b 8)
  193.                                    (c 4)
  194.                                    (d 2)
  195.                                    (e 1)))
  196.  
  197. (define five-powers-of-two-tree (generate-huffman-tree five-powers-of-two-pairs))
  198.  
  199. (encode '(a) five-powers-of-two-tree)
  200. ;; '(1)
  201. (encode '(b) five-powers-of-two-tree)
  202. ;; '(0 1)
  203. (encode '(c) five-powers-of-two-tree)
  204. ;; '(0 0 1)
  205. (encode '(d) five-powers-of-two-tree)
  206. ;; '(0 0 0 1)
  207. (encode '(e) five-powers-of-two-tree)
  208. ;; '(0 0 0 0)
  209.  
  210. (define ten-powers-of-two-pairs '((a 512)
  211.                                   (b 256)
  212.                                   (c 128)
  213.                                   (d 64)
  214.                                   (e 32)
  215.                                   (f 16)
  216.                                   (g 8)
  217.                                   (h 4)
  218.                                   (i 2)
  219.                                   (j 1)))
  220.  
  221. (define ten-powers-of-two-tree (generate-huffman-tree ten-powers-of-two-pairs))
  222.  
  223. (encode '(a) ten-powers-of-two-tree)
  224. ;; '(1)
  225. (encode '(b) ten-powers-of-two-tree)
  226. ;; '(0 1)
  227. (encode '(c) ten-powers-of-two-tree)
  228. ;; '(0 0 1)
  229. (encode '(d) ten-powers-of-two-tree)
  230. ;; '(0 0 0 1)
  231. (encode '(e) ten-powers-of-two-tree)
  232. ;; '(0 0 0 0 1)
  233. (encode '(f) ten-powers-of-two-tree)
  234. ;; '(0 0 0 0 0 1)
  235. (encode '(g) ten-powers-of-two-tree)
  236. ;; '(0 0 0 0 0 0 1)
  237. (encode '(h) ten-powers-of-two-tree)
  238. ;; '(0 0 0 0 0 0 0 1)
  239. (encode '(i) ten-powers-of-two-tree)
  240. ;; '(0 0 0 0 0 0 0 0 1)
  241. (encode '(j) ten-powers-of-two-tree)
  242. ;; '(0 0 0 0 0 0 0 0 0)
  243.  
  244. ;; With frequencies that follow the powers of two, the huffman tree will be
  245. ;; maximally unbalanced.  The most common letter will be encoded with one bit, and
  246. ;; the least common letter will be encoded with n - 1 bits.
  247.  
  248. ;; Note also that using the powers of two for the frequencies is very inefficient.
  249. ;; The weights of the elements grow too fast.  If you used some non-exponential
  250. ;; weighting system, where the weight of an element was less than the sum of the
  251. ;; weights of the less frequent elements, then you would get a better huffman tree
  252. ;; with more efficient encoding.
  253.  
  254. ;; The difference is whether the leaf for a given symbol is a right or left branch
  255. ;; of its parent node.  If the weight is greater than the sum of the weights of
  256. ;; all the less frequent elements, it is a right branch.  If it is less, it is a
  257. ;; left branch.
  258.  
  259. ;; When encoding an element, if its leaf is a right branch, you have to first
  260. ;; search through all the elements in the left branch before finding the leaf.
  261. ;; But that search is eliminated if the leaf is a left branch.
  262.  
  263. ;; For example, in the ten-powers-of-two-tree, if the weight of 'a were 510
  264. ;; instead of 512, then its code would be '(0) instead of '(1), and encoding '(a)
  265. ;; would be O(1) instead of O(n).
  266.  
  267. (encode '(a) (generate-huffman-tree (cons '(a 510) (rest ten-powers-of-two-pairs))))
  268. ;; '(0)
  269.  
  270. ;;;;;;;;;;
  271. ;; 2.72 ;;
  272. ;;;;;;;;;;
  273.  
  274. ;; Encoding a symbol requires traveling down the tree until we reach a leaf.  At
  275. ;; each node, we have to check whether our symbol is in the list of symbols for
  276. ;; the left branch or not.  That is O(# symbols in left branch).  If it is, we go
  277. ;; left.  If it is not, we then have to check whether our symbol is in the list of
  278. ;; symbols for the right branch or not.  That is O(# symbols in right branch).  If
  279. ;; it is, then we go right.  If it is not, we signal an error.
  280.  
  281. ;; So, in the worst-case scenario, we would have to search through all the symbols
  282. ;; at each level, and the encoding would be O(n * # levels) where n is the number
  283. ;; of letters in the alphabet.
  284.  
  285. ;; For the powers of two example from 2.71, to encode the least frequent element
  286. ;; is O(n ^ 2) because that tree has O(n) levels, and encoding the most frequent
  287. ;; element is O(n) because we still have to search through all the elements in the
  288. ;; left branch of the root before finding our element in the right branch.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement