Advertisement
Guest User

Untitled

a guest
Feb 19th, 2020
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.62 KB | None | 0 0
  1. from random import random
  2. from math import log2
  3. import re
  4. import time
  5.  
  6. data_len = 10240
  7. custom_alphabet = ['a', 'b', 'c', 'd']
  8. custom_probability = [0.4, 0.3, 0.2, 0.1]
  9.  
  10.  
  11. class Text:
  12. def __init__(self, filename=None, eq=False, rand=False):
  13. self.contents = ""
  14. self.alphabet = {}
  15. self.pairs = {}
  16. self.alphabet_frq = {}
  17. self.prob_list = []
  18. self.probs = []
  19. self.single_entropy = 0.0
  20. self.single_redundancy = 0.0
  21. self.pairs_entropy = 0.0
  22. self.pairs_redundancy = 0.0
  23. self.filename = filename
  24. self.encoding = {}
  25. self.encoded_text = ""
  26. self.cw_len = 0.0
  27. self.encoding_redundancy = 0.0
  28. if eq or rand:
  29. self.__from_alphabet(eq, rand)
  30. elif filename is not None:
  31. self.__from_file(filename)
  32.  
  33. def __from_alphabet(self, eq=False, rand=False):
  34. if rand:
  35. self.__create_probability_list(custom_probability)
  36. self.filename = 'rand.txt'
  37. self.probs = custom_probability
  38. elif eq:
  39. prob = 1 / len(custom_alphabet)
  40. self.__create_probability_list(prob)
  41. self.filename = 'eq.txt'
  42. self.alphabet = dict(zip(custom_alphabet, self.prob_list))
  43. self.__generate_content()
  44. self.save_content_to_file()
  45. self.__get_pairs(self.contents)
  46.  
  47. def __create_probability_list(self, pr):
  48. pr_list = []
  49. if isinstance(pr, list):
  50. s = 0
  51. for i in range(len(pr)):
  52. s += pr[i]
  53. pr_list.append(s)
  54. elif isinstance(pr, float):
  55. s = 0
  56. for i in range(len(custom_alphabet)):
  57. self.probs.append(pr)
  58. s += pr
  59. pr_list.append(s)
  60. self.prob_list = pr_list
  61.  
  62. def save_content_to_file(self):
  63. with open(self.filename, 'w+') as f:
  64. f.write(self.contents)
  65. f.close()
  66.  
  67. def __generate_content(self):
  68. for i in range(data_len):
  69. r = random()
  70. for k, v in self.alphabet.items():
  71. if r > v:
  72. continue
  73. else:
  74. self.contents += k
  75. break
  76. self.get_alphabet_frq(self.contents)
  77.  
  78. def get_alphabet_frq(self, c):
  79. contents_len = len(c)
  80. cnt = {}
  81. for ch in self.alphabet.keys():
  82. cnt[ch] = 0
  83. for ch in self.contents:
  84. cnt[ch] = cnt[ch] + 1
  85. for k, v in cnt.items():
  86. self.alphabet_frq[k] = v / contents_len
  87.  
  88. def __prepare_file_contents(self):
  89. d = ""
  90. old = ""
  91. regex = re.compile(u'[a-zа-я]')
  92. self.contents = self.contents.lower() # lowercasing all the text
  93. for i in self.contents:
  94. if regex.match(i) is not None:
  95. d += i
  96. else:
  97. d += '.'
  98. while old != d: # replacing all punctuational sequences to one symbol
  99. old = d
  100. d = d.replace("..", ".")
  101. self.contents = d
  102.  
  103. def __create_alphabet(self):
  104. for i in self.contents:
  105. if i not in self.alphabet.keys():
  106. self.alphabet[i] = 0
  107. self.get_alphabet_frq(self.contents)
  108.  
  109. def __from_file(self, filename):
  110. with open(filename, 'r') as f:
  111. self.contents = f.read()
  112. self.__prepare_file_contents()
  113. self.__create_alphabet()
  114. self.__get_pairs(self.contents)
  115.  
  116. def __get_pairs(self, d):
  117. for i in range(len(d) - 2):
  118. if d[i:i+2] not in self.pairs.keys():
  119. self.pairs[d[i:i+2]] = 0
  120. for i in self.pairs.keys():
  121. self.pairs[i] = d.count(i) / (len(d) - 1)
  122.  
  123. def calculate(self):
  124. self.single_entropy = 0.0
  125. self.pairs_entropy = 0.0
  126. self.single_redundancy = 0.0
  127. self.pairs_redundancy = 0.0
  128. for i in self.alphabet_frq.values():
  129. self.single_entropy -= i * log2(i)
  130. self.single_redundancy = 1 - (self.single_entropy / log2(len(self.alphabet_frq)))
  131. for i in self.pairs.values():
  132. self.pairs_entropy -= i * log2(i)
  133. self.pairs_redundancy = (1 - ((self.pairs_entropy / 2) / (log2(len(self.alphabet_frq)))))
  134. self.pairs_entropy = self.pairs_entropy / 2
  135.  
  136. def __fano_step(self, d):
  137. delta = 1.0
  138. i = 1
  139. while True:
  140. first_part = dict(list(d.items())[:i])
  141. second_part = dict(list(d.items())[i:])
  142. s1 = sum(first_part.values())
  143. s2 = sum(second_part.values())
  144. if len(d) == 2:
  145. break
  146. if abs(s1 - s2) < delta:
  147. delta = abs(s1 - s2)
  148. i += 1
  149. else:
  150. first_part = dict(list(d.items())[:i-1])
  151. second_part = dict(list(d.items())[i-1:])
  152. break
  153. for k, v in first_part.items():
  154. if k not in self.encoding.keys():
  155. self.encoding[k] = ""
  156. self.encoding[k] = self.encoding[k] + "0"
  157. for k, v in second_part.items():
  158. if k not in self.encoding.keys():
  159. self.encoding[k] = ""
  160. self.encoding[k] = self.encoding[k] + "1"
  161. if len(first_part) != 1:
  162. self.__fano_step(first_part)
  163. if len(second_part) != 1:
  164. self.__fano_step(second_part)
  165.  
  166. def __encode_text(self):
  167. for i in self.contents:
  168. self.encoded_text += self.encoding[i]
  169.  
  170. def encode(self):
  171. if len(self.probs) == 0:
  172. ordered_alphabet = {k: v for k, v in sorted(self.alphabet_frq.items(),
  173. key=lambda item: item[1], reverse=True)}
  174. else:
  175. ordered_alphabet = {k: v for k, v in sorted(zip(self.alphabet.keys(), self.probs),
  176. key=lambda item: item[1], reverse=True)}
  177. self.__fano_step(ordered_alphabet)
  178. self.__encode_text()
  179. frq = [0, 0]
  180. for i in self.encoded_text:
  181. frq[int(i)] += 1
  182. s = sum(frq)
  183. for i in range(2):
  184. frq[i] = frq[i] / s
  185. e = self.single_entropy
  186. self.alphabet_frq = dict(zip([0, 1], frq))
  187. self.pairs = {}
  188. self.__get_pairs(self.encoded_text)
  189. self.calculate()
  190. for k, v in self.encoding.items():
  191. self.cw_len += (len(v) * ordered_alphabet[k])
  192. self.encoding_redundancy = self.cw_len - e
  193. print(self.encoding)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement