Advertisement
Guest User

Untitled

a guest
Sep 14th, 2019
31
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.35 KB | None | 0 0
  1. import argparse
  2. from datetime import datetime
  3. import itertools
  4. import json
  5. import functools
  6. import sys
  7.  
  8.  
  9. def show_binary(byte):
  10. return '{:08b}'.format(byte)
  11.  
  12.  
  13. class StreamReader:
  14. def __init__(self, data):
  15. assert type(data) == bytes
  16. self._data = data
  17. self._byte_index = 0
  18. self._byte_total = len(data)
  19. self._bitwise = False
  20. self._bit_buffer = 0
  21. self._bit_pending = 0
  22.  
  23. def has_more(self):
  24. if self._byte_index < self._byte_total:
  25. return True
  26. if self._bitwise and self._bit_pending:
  27. return True
  28. return False
  29.  
  30. def get_byte(self, for_bits=False):
  31. assert not self._bitwise or for_bits
  32. result = self._data[self._byte_index]
  33. self._byte_index += 1
  34. # print(">>> next byte", result)
  35. return result
  36.  
  37. def get_bytes(self, size):
  38. self._byte_index += size
  39. return self._data[self._byte_index - size: self._byte_index]
  40.  
  41. def get_int(self, size):
  42. assert not self._bitwise
  43. return functools.reduce(
  44. lambda result, item: result * 256 + item,
  45. [self.get_byte() for _ in range(size)][::-1],
  46. 0
  47. )
  48.  
  49. def get_string(self):
  50. assert not self._bitwise
  51. result = ""
  52. while True:
  53. char = self.get_byte()
  54. if char == 0:
  55. return result
  56. result += chr(char)
  57.  
  58. def set_bitwise(self, value):
  59. if self._bitwise is True and value is False:
  60. assert self._bit_pending == 0
  61. self._bitwise = value
  62.  
  63. def get_bit(self):
  64. assert self._bitwise
  65. if not self._bit_pending:
  66. self._bit_buffer = self.get_byte(for_bits=True)
  67. self._bit_pending = 8
  68. result = self._bit_buffer & 1
  69. self._bit_buffer >>= 1
  70. self._bit_pending -= 1
  71. return result
  72.  
  73. def get_bits(self, size, reverse=False):
  74. return functools.reduce(
  75. lambda result, item: item | result << 1,
  76. [self.get_bit() for _ in range(size)][::-1 if reverse else 1],
  77. 0,
  78. )
  79.  
  80. def discard_remaining_bits(self):
  81. self._bit_pending = 0
  82.  
  83. def get_remaining(self):
  84. return self._data[self._byte_index:]
  85.  
  86.  
  87. class HuffmanDecoder:
  88. def __init__(self, bit_lengths):
  89. bit_length_to_count = {0: 0}
  90. max_length = 0
  91. for bit_length in bit_lengths:
  92. max_length = max(max_length, bit_length)
  93. if bit_length not in bit_length_to_count:
  94. bit_length_to_count[bit_length] = 0
  95. bit_length_to_count[bit_length] += 1
  96.  
  97. code = 0
  98. bit_length_to_next_code = {}
  99. for bit_length in range(1, max_length + 1):
  100. # TODO: Figure out what this does.
  101. code = (code + bit_length_to_count.get(bit_length - 1, 0)) << 1;
  102. bit_length_to_next_code[bit_length] = code;
  103.  
  104. codes = []
  105. for bit_length in bit_lengths:
  106. if bit_length == 0:
  107. codes.append(None)
  108. else:
  109. codes.append(bit_length_to_next_code[bit_length])
  110. bit_length_to_next_code[bit_length] += 1
  111.  
  112. self.tree = {}
  113. for index, code in enumerate(codes):
  114. if code is None:
  115. continue
  116. code_str = '{:0{width}b}'.format(code, width=bit_lengths[index])
  117. current = self.tree
  118. previous = None
  119. for char in code_str:
  120. char = int(char)
  121. if char not in current:
  122. current[char] = {}
  123. previous = current
  124. current = current[char]
  125. previous[char] = index
  126.  
  127. def extract(self, stream):
  128. current = self.tree
  129. chars = []
  130. while type(current) == dict:
  131. char = stream.get_bit()
  132. chars.append(char)
  133. current = current[char]
  134. print(chars)
  135. return current
  136.  
  137.  
  138.  
  139. class GzipDecompress:
  140. def __init__(self, data):
  141. self._stream = StreamReader(data)
  142.  
  143. def run(self):
  144. self.result = []
  145. while self._stream.has_more():
  146. self._read_gzip_block()
  147. break
  148.  
  149. def _read_gzip_block(self):
  150. # Reference = https://tools.ietf.org/html/rfc1952
  151. assert self._stream.get_byte() == 31
  152. assert self._stream.get_byte() == 139
  153. compression_method = self._stream.get_byte()
  154. flags = self._stream.get_byte()
  155. print("flags", show_binary(flags))
  156. has_filename = bool(flags & (1 << 3))
  157. has_comment = bool(flags & (1 << 4))
  158. modified_time = self._stream.get_int(size=4)
  159. print("mtime", datetime.fromtimestamp(modified_time))
  160. extra_flags = self._stream.get_byte()
  161. print("extra_flags", show_binary(extra_flags))
  162. operating_system = self._stream.get_byte()
  163. filename = self._stream.get_string() if has_filename else None
  164. print("filename", filename)
  165. comment = self._stream.get_string() if has_comment else None
  166. print("comment", comment)
  167. self._read_deflate_block()
  168.  
  169. def _read_deflate_block(self):
  170. # Reference = https://tools.ietf.org/html/rfc1951
  171. print("!!!", self._stream.get_remaining())
  172. self._stream.set_bitwise(True)
  173. is_final_block = self._stream.get_bit()
  174. print("is_final_block", is_final_block)
  175. block_type = self._stream.get_bits(2)
  176. print("block_type", block_type)
  177. if block_type == 0:
  178. self._stream.discard_remaining_bits()
  179. length = self._stream.get_int(size=2)
  180. self._stream.get_int(size=2) # one's compliment of length
  181. self.result.extend(self._stream.get_types(length))
  182. elif block_type == 1:
  183. self._read_fixed_huffman_codes()
  184. elif block_type == 2:
  185. self._read_dynamic_huffman_codes()
  186. else:
  187. raise NotImplementedError()
  188.  
  189. def _read_fixed_huffman_codes(self):
  190. bit_lengths = []
  191. symbol = 0
  192. while symbol <= 143:
  193. bit_lengths.append(8)
  194. symbol += 1
  195. while symbol <= 255:
  196. bit_lengths.append(9)
  197. symbol += 1
  198. while symbol <= 279:
  199. bit_lengths.append(7)
  200. symbol += 1
  201. while symbol <= 287:
  202. bit_lengths.append(8)
  203. symbol += 1
  204. # bit_lengths = [3, 3, 3, 3, 3, 2, 4, 4] # example for testing
  205. # for this example, the generated tree matches what is shown in rfc1951
  206. decoder = HuffmanDecoder(bit_lengths)
  207. while True:
  208. value = decoder.extract(self._stream)
  209. print(">>>", value)
  210. if value == 256:
  211. break
  212.  
  213. def _read_dynamic_huffman_codes(self):
  214. literal_count = self._stream.get_bits(5, reverse=True) + 257
  215. distance_count = self._stream.get_bits(5, reverse=True) + 1
  216. code_count = self._stream.get_bits(4, reverse=True) + 4
  217. print(literal_count, distance_count, code_count)
  218. bit_lengths = []
  219. for _ in range(code_count):
  220. bit_lengths.append(self._stream.get_bits(3, reverse=True))
  221.  
  222. ordering = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
  223. print(bit_lengths)
  224. print(ordering)
  225.  
  226. bit_lengths = [
  227. pair[0]
  228. for pair in sorted(
  229. itertools.zip_longest(bit_lengths, ordering, fillvalue=0),
  230. key=lambda pair: pair[1],
  231. )
  232. ]
  233. print(bit_lengths)
  234. decoder = HuffmanDecoder(bit_lengths)
  235.  
  236. print(json.dumps(decoder.tree, indent=4))
  237. while True:
  238. value = decoder.extract(self._stream)
  239. print(">>>", value)
  240. if value == 256:
  241. break
  242.  
  243.  
  244. try:
  245. while True:
  246. print(self._stream.get_bit(), end="")
  247. except IndexError:
  248. print("\n")
  249. raise NotImplementedError()
  250.  
  251.  
  252. if __name__ == "__main__":
  253. parser = argparse.ArgumentParser()
  254. parser.add_argument("filename", help="Name of input file.")
  255. options = parser.parse_args()
  256.  
  257. with open(options.filename, "rb") as file_handle:
  258. data = file_handle.read()
  259. print(GzipDecompress(data).run())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement