Advertisement
Guest User

Untitled

a guest
Oct 2nd, 2014
198
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.97 KB | None | 0 0
  1. VERSION = 1.0
  2.  
  3. NUM_BYTES = 256
  4.  
  5. ALL_BYTES = set()
  6.  
  7. # collect all symbols without duplications by using
  8. # Ascii code to convert numbers into symbols
  9. for byte_val in range(NUM_BYTES):
  10. ALL_BYTES.add(chr(byte_val))
  11.  
  12.  
  13. def get_unused_bytes(string):
  14. """
  15. get every unused from the string in the form of ascii code
  16. """
  17. used_bytes = set(string)
  18. # delete used element if it appear in ALL_BYTES
  19. return ALL_BYTES.difference(used_bytes)
  20.  
  21.  
  22. def get_ngrams(string, ngram_size):
  23. """
  24. get every possible string at the len of ngram_size from string
  25. """
  26. result = []
  27. upper_index = len(string) - ngram_size
  28. for index in range(upper_index + 1):
  29. next_ngram = string[index:index + ngram_size]
  30. # use index slicing to sperate the words
  31. # and the length is upto index+ngram_size
  32. result.append(next_ngram)
  33. return result
  34.  
  35.  
  36. def freqs(seq):
  37. """
  38. count frequencies of a word,
  39. seq is the input where the list got from get_ngrams
  40. """
  41. counters = {}
  42. for item in seq:
  43. if item in counters:
  44. # it increments one when the word appear again
  45. counters[item] += 1
  46. else:
  47. # if it's the only one, freq keeps at one
  48. counters[item] = 1
  49. return counters
  50.  
  51.  
  52. def get_second_item(seq):
  53. # get second item, as the first one is seq[0]
  54. return seq[1]
  55.  
  56.  
  57. def sorted_ngrams_by_freq(dict):
  58. """
  59. the only argument implies it's a dictionary
  60. and the aim is to get a list of words which is at the order of frequency
  61. """
  62. items = dict.items()
  63. print items
  64. # get elements from dict in the form of list
  65. sorted_items = sorted(items, key=get_second_item, reverse=True)
  66. # sort it by frequency and show the words only
  67. result = []
  68. for ngram, _freq in sorted_items:
  69. # append every word to result
  70. result.append(ngram)
  71. return result
  72.  
  73. MAX_MAPPINGS = 255
  74.  
  75.  
  76. def make_ngram_encoding(sorted_ngrams, encoding_bytes):
  77. """
  78. the aim of encoding_bytes is a sort of encryption,
  79. just like using symbols "`~=+-_"':;<,>.?/" to
  80. make sorted_ngrams simple and therefore reduce the size
  81. and encoding bytes can also be numbers and chars
  82. """
  83. result = {}
  84. count = 0
  85. #zip the sorted list and target econding symbols
  86. for ngram, encoding_byte in zip(sorted_ngrams, encoding_bytes):
  87. if count >= MAX_MAPPINGS:
  88. # if count exceeds the limit, stop the loop
  89. break
  90. # assign the encoding element to the value of the corresponding key
  91. # which is the element of the sorted list
  92. result[ngram] = encoding_byte
  93. count += 1
  94. return result
  95.  
  96.  
  97. def make_header(ngram_size, encoded_ngrams):
  98. """
  99. usage:
  100. ngram_size standards for the size of the ngram
  101. encoded_ngrams standards for a sorted and encoded dict
  102. so when the function is called,
  103. the result would be a abstract value
  104. """
  105. number_mappings = len(encoded_ngrams)
  106. number_mappings_as_char = chr(number_mappings)
  107. #this gives a ascii code for num_mappings
  108. ngram_size_as_char = chr(ngram_size)
  109. #this gives a ascii code for ngram_size
  110. result = number_mappings_as_char + ngram_size_as_char
  111. #sum up the two ascii codes
  112. for ngram in encoded_ngrams:
  113. #also add the keys of the encoded dict to that result
  114. result += ngram + encoded_ngrams[ngram]
  115. return result
  116.  
  117.  
  118. def make_encoded_string(string, ngram_size, encoded_ngrams):
  119. #further reduce the string by using the symbols to replace the encoded_ngrams,
  120. #one example could be like this:
  121. """make_encoded_string('\x06\x03aad.vds=abc`afc~ewr+ikm_',3,
  122. {'aad': '.', 'vds': '=', 'abc': '`', 'afc': '~',
  123. 'ewr': '+', 'ikm': '_'})
  124. and the result is '\x06\x03..==``~~++__'
  125. where string is the header,
  126. ngram_szie and encoded_nrams have used in previous functions
  127. """
  128. result = ''
  129. index = 0
  130. while index < len(string):
  131. # ngram will contain the head as index initialised as 0
  132. ngram = string[index:index + ngram_size]
  133. if ngram in encoded_ngrams:
  134. # if the statemtn is true, which means the key of encoded_ngrams
  135. # in the string will be replaced by the corresponding value
  136. result += encoded_ngrams[ngram]
  137. print result
  138. index += ngram_size
  139. else:
  140. #if there is an interval of string isn't in ngrams
  141. #add the first element to the reuslt
  142. result += string[index]
  143. index += 1
  144. return result
  145.  
  146. MINIMUM_ENCODING_BYTES = 1
  147.  
  148.  
  149. def compress_file(ngram_size, in_filename, out_filename):
  150. """
  151. the final functon uses all previous function
  152. to reduce the size of a file and output it
  153. """
  154. if ngram_size <= 0:
  155. print("n-gram size must be greater than 0")
  156. return
  157.  
  158. in_file = open(in_filename) # open read the file
  159. contents = in_file.read() # store the file into contents
  160. in_file.close() # colse the file
  161.  
  162. #get unsed bytes at the form of list and assign to encoding _bytes
  163. encoding_bytes = list(get_unused_bytes(contents))
  164. # get the length of the encoding_bytes
  165. num_encoding_bytes = len(encoding_bytes)
  166.  
  167. # assertion process, the num_encoding_bytes cannot be less than 1
  168. if num_encoding_bytes <= MINIMUM_ENCODING_BYTES:
  169. print("Cannot compress file %s" % in_filename)
  170. print("Insufficient unused bytes in file")
  171. print("Found %s unused bytes, but %s are required" %
  172. (num_encoding_bytes, MINIMUM_ENCODING_BYTES))
  173. return
  174.  
  175. # get every possible string at the len of ngram_size
  176. ngrams = get_ngrams(contents, ngram_size)
  177. # get the length
  178. num_ngrams = len(ngrams)
  179.  
  180. # another assertion process, the len cannot be zero
  181. if num_ngrams == 0:
  182. print("Cannot compress file %s" % in_filename)
  183. print("Zero ngrams found, perhaps file is too small?")
  184. return
  185.  
  186. # get frequencies of each sinlge words of the target strings
  187. ngram_freqs = freqs(ngrams)
  188.  
  189. # sort them at the order of frequency and assign those words to a list
  190. sorted_ngrams = sorted_ngrams_by_freq(ngram_freqs)
  191.  
  192. # encoding the list and the encoding_bytes to form a dictinoary
  193. encoded_ngrams = make_ngram_encoding(sorted_ngrams, encoding_bytes)
  194.  
  195. # get a header
  196. header = make_header(ngram_size, encoded_ngrams)
  197.  
  198. #further encoding
  199. encoded_contents = make_encoded_string(contents, ngram_size,
  200. encoded_ngrams)
  201.  
  202. out_file = open(out_filename, 'w') # write a new file
  203. out_file.write(header) # write the header
  204. out_file.write(encoded_contents) # write the contents
  205. out_file.close() # close the file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement