Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Version tag for this module (not read anywhere in the visible code).
VERSION = 1.0

# Number of distinct values a single byte can take.
NUM_BYTES = 256

# Every single-byte symbol, represented as the one-character string that
# chr() returns for each code in range(NUM_BYTES).  Stored as a set so that
# the symbols missing from an input can be found with a set difference.
ALL_BYTES = {chr(byte_val) for byte_val in range(NUM_BYTES)}
def get_unused_bytes(string):
    """
    Return the set of symbols from ALL_BYTES that never occur in ``string``.

    Each symbol is a one-character string, exactly as stored in ALL_BYTES.
    """
    # Subtracting the characters present in the input leaves only the
    # symbols that are free to be used as replacement codes.
    return ALL_BYTES - set(string)
def get_ngrams(string, ngram_size):
    """
    Return every contiguous slice of ``string`` that is ``ngram_size`` long.

    The slices are returned as a list, ordered by starting position, and
    overlapping slices are all included.  For a positive ``ngram_size`` no
    larger than ``len(string)`` the list has
    ``len(string) - ngram_size + 1`` entries; if ``string`` is shorter than
    ``ngram_size`` the list is empty.
    """
    # The last usable start index is the one whose slice is still full length.
    last_start = len(string) - ngram_size
    return [string[start:start + ngram_size]
            for start in range(last_start + 1)]
def freqs(seq):
    """
    Count how many times each item occurs in ``seq``.

    ``seq`` is an iterable of hashable items (compress_file passes the list
    produced by get_ngrams).  Returns a plain dict that maps each distinct
    item to its number of occurrences.
    """
    # Imported locally because the module has no top-level import section.
    from collections import Counter

    # Convert back to a plain dict so callers keep getting the same type;
    # a Counter would quietly answer 0 for keys that were never seen.
    return dict(Counter(seq))
def get_second_item(seq):
    """
    Return the element at index 1 of ``seq``.

    Used as the sort key for (ngram, frequency) pairs, where index 1 is
    the frequency.
    """
    return seq[1]


def sorted_ngrams_by_freq(dict):
    """
    Return the keys of a frequency dict, most frequent first.

    ``dict`` maps each n-gram to its occurrence count (the shape returned
    by freqs).  The result is a list holding only the n-grams, ordered by
    descending count; the counts themselves are dropped.  Because sorted()
    is stable, n-grams with equal counts keep the order in which the dict
    yields them.
    """
    # NOTE: the parameter name shadows the built-in ``dict``.  It is kept
    # as is because renaming it would break callers that pass it by keyword.
    by_freq_desc = sorted(dict.items(), key=get_second_item, reverse=True)
    return [ngram for ngram, _freq in by_freq_desc]
# Upper bound on the number of n-gram -> symbol mappings in one encoding.
# make_header writes the mapping count with a single chr() call, so the
# limit presumably exists to keep that count within one byte.
MAX_MAPPINGS = 255


def make_ngram_encoding(sorted_ngrams, encoding_bytes):
    """
    Pair n-grams with replacement symbols and return the mapping as a dict.

    ``sorted_ngrams`` is the list of n-grams in priority order (most
    frequent first when it comes from sorted_ngrams_by_freq) and
    ``encoding_bytes`` is the collection of symbols available as codes.
    The i-th n-gram is mapped to the i-th symbol.  Pairing stops as soon as
    either input runs out or MAX_MAPPINGS pairs have been taken.
    """
    # Imported locally because the module has no top-level import section.
    from itertools import islice

    pairs = zip(sorted_ngrams, encoding_bytes)
    # islice caps the number of pairs without a hand-maintained counter.
    return dict(islice(pairs, MAX_MAPPINGS))
def make_header(ngram_size, encoded_ngrams):
    """
    Build the header string that describes an encoding.

    ``ngram_size`` is the length of each n-gram and ``encoded_ngrams`` is
    the dict mapping each n-gram to its replacement symbol (as built by
    make_ngram_encoding).

    Layout of the returned string:
      1. chr(number of mappings)
      2. chr(ngram_size)
      3. for every mapping, in the dict's iteration order: the n-gram
         immediately followed by its replacement symbol
    """
    parts = [chr(len(encoded_ngrams)), chr(ngram_size)]
    for ngram, encoding_byte in encoded_ngrams.items():
        parts.append(ngram)
        parts.append(encoding_byte)
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(parts)
def make_encoded_string(string, ngram_size, encoded_ngrams):
    """
    Return ``string`` with every mapped n-gram replaced by its symbol.

    The input is scanned left to right.  At each position the next
    ``ngram_size`` characters are looked up in ``encoded_ngrams``:

    * on a hit, the replacement symbol is emitted and the scan jumps past
      the whole n-gram, so replacements never overlap;
    * on a miss, the single character at that position is copied through
      unchanged and the scan advances by one.

    Example:
        make_encoded_string('abcabcx', 3, {'abc': '#'}) returns '##x'

    ``ngram_size`` is expected to be positive; compress_file rejects
    non-positive sizes before calling this function.
    """
    pieces = []
    index = 0
    length = len(string)
    while index < length:
        ngram = string[index:index + ngram_size]
        if ngram in encoded_ngrams:
            pieces.append(encoded_ngrams[ngram])
            index += ngram_size
        else:
            pieces.append(string[index])
            index += 1
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)
# Smallest number of unused byte values that must be available in the input
# before compress_file will attempt to build an encoding.
MINIMUM_ENCODING_BYTES = 1


def compress_file(ngram_size, in_filename, out_filename):
    """
    Compress ``in_filename`` into ``out_filename`` using n-gram substitution.

    Steps:
      1. read the input and find the byte values it never uses;
      2. collect all n-grams of length ``ngram_size`` and rank them by
         frequency;
      3. map the most frequent n-grams to the unused byte values;
      4. write the header (from make_header) followed by the encoded
         contents (from make_encoded_string).

    Nothing is written, and a message is printed instead, when
    ``ngram_size`` is not positive, when the input has fewer than
    MINIMUM_ENCODING_BYTES unused byte values, or when it contains no
    n-grams at all.  The function always returns None.
    """
    if ngram_size <= 0:
        print("n-gram size must be greater than 0")
        return

    # Binary mode keeps the data byte-exact (no newline translation), and
    # the with-block closes the file even if read() raises.
    with open(in_filename, 'rb') as in_file:
        contents = in_file.read()
    # On Python 3 read() returns bytes.  Decoding as latin-1 turns each byte
    # into the character with the same code point, which is exactly what the
    # chr()-based symbols in ALL_BYTES are.  On Python 2 the data is already
    # a str and is left untouched.
    if not isinstance(contents, str):
        contents = contents.decode('latin-1')

    # Sorted so the symbol assignment (and thus the output file) is the same
    # on every run; iteration order of a set is not guaranteed.
    encoding_bytes = sorted(get_unused_bytes(contents))
    num_encoding_bytes = len(encoding_bytes)
    # Strictly "fewer than required": having exactly the minimum is enough.
    if num_encoding_bytes < MINIMUM_ENCODING_BYTES:
        print("Cannot compress file %s" % in_filename)
        print("Insufficient unused bytes in file")
        print("Found %s unused bytes, but %s are required" %
              (num_encoding_bytes, MINIMUM_ENCODING_BYTES))
        return

    ngrams = get_ngrams(contents, ngram_size)
    if len(ngrams) == 0:
        print("Cannot compress file %s" % in_filename)
        print("Zero ngrams found, perhaps file is too small?")
        return

    # Rank the n-grams by frequency and give the best ones a one-byte code.
    ngram_freqs = freqs(ngrams)
    sorted_ngrams = sorted_ngrams_by_freq(ngram_freqs)
    encoded_ngrams = make_ngram_encoding(sorted_ngrams, encoding_bytes)

    header = make_header(ngram_size, encoded_ngrams)
    encoded_contents = make_encoded_string(contents, ngram_size,
                                           encoded_ngrams)

    # Mirror of the read step: on Python 3 turn each character back into
    # the single byte with the same code point before writing.  Done before
    # opening the output so a failure here leaves no empty file behind.
    payload = header + encoded_contents
    if not isinstance(payload, bytes):
        payload = payload.encode('latin-1')
    with open(out_filename, 'wb') as out_file:
        out_file.write(payload)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement