Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
import time
from collections import Counter
from itertools import accumulate
from math import log2
from random import random
# Number of symbols produced when a text is generated from the custom alphabet.
data_len = 10240
# Symbols of the synthetic source used by the `eq` and `rand` generation modes.
custom_alphabet = ['a', 'b', 'c', 'd']
# Probability of each symbol of `custom_alphabet` (same order) in `rand` mode.
custom_probability = [0.4, 0.3, 0.2, 0.1]
class Text:
    """A text sample with symbol statistics and a Fano prefix code.

    The text is either read from a file or generated from the module-level
    ``custom_alphabet``.  ``calculate()`` fills in the entropy and redundancy
    figures for single symbols and for symbol pairs; ``encode()`` builds a
    Fano code for the text and then recomputes the same figures for the
    resulting bit string.
    """

    def __init__(self, filename=None, eq=False, rand=False):
        """Create a text, optionally loading or generating its contents.

        Args:
            filename: Path of a text file to analyse.  Ignored when ``eq`` or
                ``rand`` is set (those modes choose their own output file).
            eq: Generate ``data_len`` symbols of ``custom_alphabet`` with
                equal probabilities and save them to ``eq.txt``.
            rand: Generate ``data_len`` symbols using ``custom_probability``
                and save them to ``rand.txt``.  Takes precedence over ``eq``.
        """
        self.contents = ""              # text being analysed
        self.alphabet = {}              # symbol -> cumulative probability (generated) or 0 (file)
        self.pairs = {}                 # two-symbol substring -> relative frequency
        self.alphabet_frq = {}          # symbol -> relative frequency
        self.prob_list = []             # cumulative probabilities of the generated source
        self.probs = []                 # per-symbol probabilities of the generated source
        self.single_entropy = 0.0
        self.single_redundancy = 0.0
        self.pairs_entropy = 0.0
        self.pairs_redundancy = 0.0
        self.filename = filename
        self.encoding = {}              # symbol -> codeword made of '0'/'1'
        self.encoded_text = ""
        self.cw_len = 0.0               # average codeword length
        self.encoding_redundancy = 0.0  # average codeword length minus source entropy
        if eq or rand:
            self.__from_alphabet(eq, rand)
        elif filename is not None:
            self.__from_file(filename)

    def __from_alphabet(self, eq=False, rand=False):
        """Generate the contents from ``custom_alphabet`` and save them to a file."""
        if rand:
            self.__create_probability_list(custom_probability)
            self.filename = 'rand.txt'
            # Copy so later changes to this instance never touch the module constant.
            self.probs = list(custom_probability)
        elif eq:
            prob = 1 / len(custom_alphabet)
            self.__create_probability_list(prob)
            self.filename = 'eq.txt'
        self.alphabet = dict(zip(custom_alphabet, self.prob_list))
        self.__generate_content()
        self.save_content_to_file()
        self.__get_pairs(self.contents)

    def __create_probability_list(self, pr):
        """Store the cumulative probabilities of the source in ``prob_list``.

        Args:
            pr: Either a sequence of per-symbol probabilities, or a single
                number used as the probability of every symbol of
                ``custom_alphabet`` (in which case ``probs`` is filled too).
        """
        if isinstance(pr, (list, tuple)):
            self.prob_list = list(accumulate(pr))
        elif isinstance(pr, (int, float)):
            self.probs = [pr] * len(custom_alphabet)
            self.prob_list = list(accumulate(self.probs))
        else:
            self.prob_list = []

    def save_content_to_file(self):
        """Write ``contents`` to ``filename``, replacing any existing file."""
        with open(self.filename, 'w') as f:
            f.write(self.contents)

    def __generate_content(self):
        """Append ``data_len`` random symbols to ``contents``.

        Each symbol is chosen by drawing a uniform number and taking the
        first symbol whose cumulative probability is not below it.
        """
        thresholds = list(self.alphabet.items())
        generated = []
        if thresholds:
            last_symbol = thresholds[-1][0]
            for _ in range(data_len):
                r = random()
                for symbol, upper in thresholds:
                    if r <= upper:
                        generated.append(symbol)
                        break
                else:
                    # Floating-point rounding can leave the last cumulative
                    # value slightly below 1.0; the draw then belongs to the
                    # last symbol instead of being dropped.
                    generated.append(last_symbol)
        self.contents += "".join(generated)
        self.get_alphabet_frq(self.contents)

    def get_alphabet_frq(self, c):
        """Compute the relative frequency of every symbol in ``c``.

        The result is stored in ``alphabet_frq``.  Symbols of ``alphabet``
        that do not occur in ``c`` get frequency 0; an empty ``c`` yields 0
        for every symbol.
        """
        counts = dict.fromkeys(self.alphabet, 0)
        counts.update(Counter(c))
        total = len(c)
        self.alphabet_frq = {
            symbol: (count / total if total else 0.0)
            for symbol, count in counts.items()
        }

    def __prepare_file_contents(self):
        """Normalise ``contents`` in place.

        The text is lowercased, and every run of characters other than
        ``a-z`` / ``а-я`` is replaced by a single ``.``.
        """
        self.contents = re.sub(u'[^a-zа-я]+', '.', self.contents.lower())

    def __create_alphabet(self):
        """Collect the distinct symbols of ``contents`` and their frequencies."""
        for symbol in self.contents:
            if symbol not in self.alphabet:
                self.alphabet[symbol] = 0
        self.get_alphabet_frq(self.contents)

    def __from_file(self, filename):
        """Load, normalise and analyse the text stored in ``filename``."""
        # NOTE: the file is decoded with the platform default encoding.
        with open(filename, 'r') as f:
            self.contents = f.read()
        self.__prepare_file_contents()
        self.__create_alphabet()
        self.__get_pairs(self.contents)

    def __get_pairs(self, d):
        """Add the relative frequency of every two-symbol substring of ``d``.

        Pairs are taken at every position (overlapping), so a text of
        ``n`` symbols contributes ``n - 1`` pairs and the frequencies sum
        to 1.  Results are merged into ``pairs``.
        """
        total = len(d) - 1
        counts = Counter(d[i:i + 2] for i in range(total))
        for pair, count in counts.items():
            self.pairs[pair] = count / total

    @staticmethod
    def __entropy(probabilities):
        """Return the Shannon entropy (in bits) of the given probabilities."""
        entropy = 0.0
        for p in probabilities:
            # A zero-probability outcome contributes nothing (and log2(0)
            # is undefined), so it is skipped.
            if p > 0:
                entropy -= p * log2(p)
        return entropy

    def calculate(self):
        """Recompute entropy and redundancy from ``alphabet_frq`` and ``pairs``.

        ``pairs_entropy`` is the pair entropy divided by two, i.e. per
        symbol.  Redundancy is measured against ``log2`` of the alphabet
        size; for an alphabet of at most one symbol it is reported as 0.
        """
        alphabet_size = len(self.alphabet_frq)
        max_entropy = log2(alphabet_size) if alphabet_size else 0.0
        self.single_entropy = self.__entropy(self.alphabet_frq.values())
        self.pairs_entropy = self.__entropy(self.pairs.values()) / 2
        if max_entropy > 0:
            self.single_redundancy = 1 - (self.single_entropy / max_entropy)
            self.pairs_redundancy = 1 - (self.pairs_entropy / max_entropy)
        else:
            self.single_redundancy = 0.0
            self.pairs_redundancy = 0.0

    def __fano_step(self, d):
        """Split ``d`` in two and extend the codewords of its symbols.

        Args:
            d: Mapping symbol -> probability, ordered by decreasing
                probability.  Symbols in the first part get a ``0`` appended
                to their codeword, symbols in the second part a ``1``; each
                part with more than one symbol is then split recursively.
        """
        items = list(d.items())
        if len(items) < 2:
            return  # nothing left to distinguish
        values = [probability for _, probability in items]
        # Move the split point right while the two halves keep getting
        # closer in total probability.  Both halves always stay non-empty,
        # which guarantees the recursion terminates.
        split = 1
        best = abs(sum(values[:1]) - sum(values[1:]))
        for i in range(2, len(items)):
            diff = abs(sum(values[:i]) - sum(values[i:]))
            if diff >= best:
                break
            best = diff
            split = i
        first_part = items[:split]
        second_part = items[split:]
        for symbol, _ in first_part:
            self.encoding[symbol] = self.encoding.get(symbol, "") + "0"
        for symbol, _ in second_part:
            self.encoding[symbol] = self.encoding.get(symbol, "") + "1"
        self.__fano_step(dict(first_part))
        self.__fano_step(dict(second_part))

    def __encode_text(self):
        """Translate ``contents`` into ``encoded_text`` using ``encoding``."""
        self.encoded_text = "".join(self.encoding[symbol] for symbol in self.contents)

    def encode(self):
        """Build a Fano code for the text and analyse the encoded bit string.

        The code is built from ``probs`` when the text was generated and from
        ``alphabet_frq`` otherwise.  On return ``encoding``, ``encoded_text``,
        ``cw_len`` and ``encoding_redundancy`` describe the code, while
        ``alphabet_frq``, ``pairs`` and the entropy/redundancy attributes
        describe the *encoded* text (keys ``0`` and ``1``).  The resulting
        code table is printed.

        Raises:
            ValueError: If there is no text or no symbol statistics to encode.
        """
        if self.probs:
            source = zip(self.alphabet.keys(), self.probs)
        else:
            source = self.alphabet_frq.items()
        ordered_alphabet = dict(sorted(source, key=lambda item: item[1], reverse=True))
        if not self.contents or not ordered_alphabet:
            raise ValueError("nothing to encode: the text is empty")

        # Start from a clean slate so a previous run cannot leak into this one.
        self.encoding = {}
        self.encoded_text = ""
        self.cw_len = 0.0

        # Entropy of the source text, taken before the statistics are
        # replaced by those of the encoded text below.
        self.calculate()
        source_entropy = self.single_entropy

        if len(ordered_alphabet) == 1:
            # A lone symbol cannot be split; it still needs a one-bit codeword.
            self.encoding = {next(iter(ordered_alphabet)): "0"}
        else:
            self.__fano_step(ordered_alphabet)
        self.__encode_text()

        total_bits = len(self.encoded_text)
        ones = self.encoded_text.count("1")
        frq = [(total_bits - ones) / total_bits, ones / total_bits]
        self.alphabet_frq = dict(zip([0, 1], frq))
        self.pairs = {}
        self.__get_pairs(self.encoded_text)
        self.calculate()

        for symbol, codeword in self.encoding.items():
            self.cw_len += len(codeword) * ordered_alphabet[symbol]
        self.encoding_redundancy = self.cw_len - source_entropy
        print(self.encoding)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement