# Untitled

# Presumably the version of this module / its output format -- confirm.
VERSION = 1.0

# Number of distinct values a single byte can hold.
NUM_BYTES = 256

# The full alphabet: one single-character string per code 0 .. NUM_BYTES - 1.
# Built in one expression so that no loop variable is left behind in the
# module namespace.
ALL_BYTES = set(map(chr, range(NUM_BYTES)))


def get_unused_bytes(string):
    """
    Return the set of symbols from ALL_BYTES that never occur in string.

    string is iterated item by item; every distinct item found in it is
    removed from a copy of the full alphabet. ALL_BYTES itself is not
    modified.
    """
    # Set subtraction builds a new set, leaving the module constant intact.
    return ALL_BYTES - set(string)


def get_ngrams(string, ngram_size):
    """
    Return every contiguous slice of length ngram_size taken from string.

    Slices are produced left to right, one per start offset, so
    consecutive ngrams overlap. When string is shorter than ngram_size
    the result is an empty list.
    """
    # One window per start offset 0 .. len(string) - ngram_size inclusive;
    # a non-positive count simply yields an empty range.
    window_count = len(string) - ngram_size + 1
    return [string[start:start + ngram_size] for start in range(window_count)]


def freqs(seq):
    """
    Count how many times each item occurs in seq.

    Returns a plain dict mapping every distinct item to its number of
    occurrences. Items must be hashable.
    """
    tally = {}
    for element in seq:
        # A first sighting starts from 0, so it ends up with a count of 1.
        tally[element] = tally.get(element, 0) + 1
    return tally


def get_second_item(seq):
    """
    Return the item at index 1 of seq (the one after seq[0]).
    """
    second = seq[1]
    return second


def sorted_ngrams_by_freq(dict):
    """
    Return the keys of a frequency table, most frequent first.

    dict maps each ngram to its frequency (the parameter name shadows the
    builtin; it is kept so existing callers keep working). The result is
    a list of the ngrams only, ordered by descending frequency. Ngrams
    with equal frequency keep the order in which the mapping yields them,
    because the sort is stable.

    Nothing is written to stdout.
    """
    # Sort the (ngram, frequency) pairs on the frequency, highest first.
    ranked = sorted(dict.items(), key=lambda pair: pair[1], reverse=True)
    # Drop the frequencies and keep the ngrams in ranked order.
    return [ngram for ngram, _freq in ranked]

# Upper bound on the number of ngram -> symbol pairs make_ngram_encoding
# will place in its result.
MAX_MAPPINGS = 255


def make_ngram_encoding(sorted_ngrams, encoding_bytes):
    """
    Pair each ngram with a replacement symbol.

    sorted_ngrams and encoding_bytes are walked in lockstep: the first
    ngram gets the first symbol, the second ngram the second symbol, and
    so on. Pairing ends when either input runs out or once MAX_MAPPINGS
    pairs have been made, whichever comes first.

    Returns a dict mapping ngram -> symbol.
    """
    mapping = {}
    pairs = zip(sorted_ngrams, encoding_bytes)
    for position, (ngram, encoding_byte) in enumerate(pairs):
        # position counts the pairs already stored; stop at the cap.
        if position >= MAX_MAPPINGS:
            break
        mapping[ngram] = encoding_byte
    return mapping


def make_header(ngram_size, encoded_ngrams):
    """
    Build the header string that describes an ngram encoding.

    Layout of the returned string:
      1. chr(number of mappings in encoded_ngrams)
      2. chr(ngram_size)
      3. for every mapping, in the dict's iteration order: the ngram
         immediately followed by its replacement symbol

    ngram_size is the length of each ngram; encoded_ngrams maps
    ngram -> replacement symbol.
    """
    # The two leading characters carry the mapping count and the ngram size.
    pieces = [chr(len(encoded_ngrams)), chr(ngram_size)]
    # Each table entry is the ngram text followed by its symbol.
    pieces.extend([ngram + encoded_ngrams[ngram] for ngram in encoded_ngrams])
    return ''.join(pieces)


def make_encoded_string(string, ngram_size, encoded_ngrams):
    """
    Replace every mapped ngram in string with its replacement symbol.

    string is scanned left to right. At each position the next
    ngram_size characters are looked up in encoded_ngrams (a dict mapping
    ngram -> symbol):

    * on a hit, the symbol is emitted and the scan skips the whole ngram;
    * otherwise the single character at the current position is copied
      unchanged and the scan advances by one.

    ngram_size is expected to be positive; with a size of 0 a matching
    '' key would never advance the scan.

    Example:
        >>> make_encoded_string('abcabcx', 3, {'abc': '!'})
        '!!x'

    Returns the encoded string. Nothing is written to stdout.
    """
    # Collect the output pieces and join once at the end, instead of
    # growing a string with += on every step.
    pieces = []
    index = 0
    length = len(string)
    while index < length:
        # Near the end of string this slice may be shorter than
        # ngram_size; it is still looked up like any other candidate.
        ngram = string[index:index + ngram_size]
        if ngram in encoded_ngrams:
            pieces.append(encoded_ngrams[ngram])
            index += ngram_size
        else:
            pieces.append(string[index])
            index += 1
    return ''.join(pieces)

# Smallest number of unused symbols the input must leave free for
# compress_file to attempt a compression.
MINIMUM_ENCODING_BYTES = 1


def compress_file(ngram_size, in_filename, out_filename):
    """
    Compress in_filename into out_filename by ngram substitution.

    Steps:
      1. read the whole input file;
      2. find the symbols that do not occur in it (these become the
         replacement symbols);
      3. split the contents into ngrams of length ngram_size, count them
         and rank them by frequency;
      4. map the most frequent ngrams to the unused symbols;
      5. write a header describing the mapping, followed by the contents
         with every mapped ngram replaced by its symbol.

    Problems are reported with print() and the function returns without
    writing anything when:
      * ngram_size is not greater than 0;
      * the input has fewer than MINIMUM_ENCODING_BYTES unused symbols;
      * the input yields no ngrams at all.

    Always returns None.

    NOTE(review): both files are opened in the default text mode, as in
    the original code -- confirm whether binary mode is wanted for
    arbitrary input.
    """
    if ngram_size <= 0:
        print("n-gram size must be greater than 0")
        return

    # The context manager closes the file even if read() raises.
    with open(in_filename) as in_file:
        contents = in_file.read()

    # Symbols absent from the input can stand in for ngrams unambiguously.
    encoding_bytes = list(get_unused_bytes(contents))
    num_encoding_bytes = len(encoding_bytes)

    # Having exactly the required number of unused symbols is sufficient;
    # only strictly fewer is an error, which matches the message below.
    if num_encoding_bytes < MINIMUM_ENCODING_BYTES:
        print("Cannot compress file %s" % in_filename)
        print("Insufficient unused bytes in file")
        print("Found %s unused bytes, but %s are required" %
                 (num_encoding_bytes, MINIMUM_ENCODING_BYTES))
        return

    # Every overlapping slice of length ngram_size.
    ngrams = get_ngrams(contents, ngram_size)

    # No ngrams means the contents are shorter than ngram_size.
    if len(ngrams) == 0:
        print("Cannot compress file %s" % in_filename)
        print("Zero ngrams found, perhaps file is too small?")
        return

    # Rank the ngrams, most frequent first, so that the limited supply of
    # replacement symbols goes to the most common ones.
    ngram_freqs = freqs(ngrams)
    sorted_ngrams = sorted_ngrams_by_freq(ngram_freqs)

    # ngram -> replacement symbol.
    encoded_ngrams = make_ngram_encoding(sorted_ngrams, encoding_bytes)

    # The header records the mapping ahead of the encoded contents.
    header = make_header(ngram_size, encoded_ngrams)

    encoded_contents = make_encoded_string(contents, ngram_size,
                                           encoded_ngrams)

    # Header first, then the encoded contents.
    with open(out_filename, 'w') as out_file:
        out_file.write(header)
        out_file.write(encoded_contents)