Advertisement
gchebanov

word autosplit

Jun 14th, 2022
1,299
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.49 KB | None | 0 0
  1. from collections import Counter
  2. from math import log2
  3.  
  4.  
  5. def prepare_ru(s):
  6.     A = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'.lower() + ' '
  7.     s = ''.join(e for e in s.replace('\n', ' ').replace('\t', ' ').lower() if e in A)
  8.     s = ' '.join(s.split())
  9.     return s
  10.  
  11.  
  12. def evaluate(a, d):
  13.     n = sum(d.values())  # d.total()
  14.     h0, h1 = .0, .0
  15.     for k, v in d.items():
  16.         h0 -= v * log2(v / n)
  17.         h1 += len(k)
  18.     # print(f'{h0=} {h1=}')
  19.     return h0 + h1 * log2(33)
  20.  
  21.  
  22. def sequence_join_slow(a, e0, e1):
  23.     i = 0
  24.     while i + 1 < len(a):
  25.         if a[i] == e0 and a[i + 1] == e1:
  26.             a[i] += a[i + 1]
  27.             a.pop(i + 1)
  28.         i += 1
  29.     return a
  30.  
  31.  
  32. def sequence_join(a, e0, e1):
  33.     b = []
  34.     i = 0
  35.     while i < len(a):
  36.         if i + 1 < len(a) and a[i] == e0 and a[i + 1] == e1:
  37.             b.append(e0 + e1)
  38.             i += 2
  39.         else:
  40.             b.append(a[i])
  41.             i += 1
  42.     return b
  43.  
  44.  
  45. def evaluate_join(a, e0, e1):
  46.     a = a.copy()
  47.     a = sequence_join(a, e0, e1)
  48.     d = Counter(a)
  49.     return evaluate(a, d)
  50.  
  51.  
  52. def try_relax(a, d, ev0, /, min_delta, min_cnt, strict_greedy):
  53.     d2, d2n = Counter(), len(a) - 1
  54.     for e0, e1 in zip(a[:-1], a[1:]):
  55.         d2[(e0, e1)] += 1
  56.     d2ev = [(evaluate_join(a, e0, e1), (e0, e1), cnt01)
  57.             for (e0, e1), cnt01 in d2.most_common()
  58.             if cnt01 >= min_cnt]
  59.     d2ev.sort()
  60.     for _, (e0, e1), cnt01 in d2ev:
  61.         if e0 + e1 in d:
  62.             continue
  63.         ev1 = evaluate_join(a, e0, e1)
  64.         if ev0 - ev1 < min_delta:
  65.             continue
  66.         a = sequence_join(a, e0, e1)
  67.         d = Counter(a)
  68.         ev1 = evaluate(a, d)
  69.         print(f'{ev1:9.3f} {ev0 - ev1:9.3f} {e0:>16} {e1:>16} {cnt01:6} {len(a):9} {len(d):9}')
  70.         ev0 = ev1
  71.         if strict_greedy:
  72.             break
  73.     return a, d, ev0
  74.  
  75.  
  76. def rebuild(s, d):
  77.     a, n = [], len(s)
  78.     i = 0
  79.     while i < n:
  80.         for k in range(min(32, n - i), 0, -1):
  81.             if s[i:i + k] in d:
  82.                 a.append(s[i:i + k])
  83.                 i += k
  84.                 break
  85.         else:
  86.             print('fail non-recursive', i, '/', n)
  87.             break
  88.     return a
  89.  
  90.  
  91. def find_word_split(s, /,
  92.                     min_delta=20,
  93.                     min_cnt=1,
  94.                     strict_greedy=False
  95.                     ):
  96.     a = list(s)
  97.     d = Counter(a)
  98.     ev0 = evaluate(a, d)
  99.     print(f'parameters {min_delta=} {min_cnt=}')
  100.     print(f'ev={ev0:.6f} words={len(a)} dict={len(d)}')
  101.     print(f'{"ev":>9} {"delta_ev":>9} {"left":>16} {"right":>16} {"cnt":>6} {"words":>9} {"dict":>9}')
  102.     while True:
  103.         a, d, ev1 = try_relax(a, d, ev0,
  104.                               min_delta=min_delta, min_cnt=min_cnt, strict_greedy=strict_greedy)
  105.         if ev1 >= ev0 - 1e-6:
  106.             break
  107.         ev0 = ev1
  108.  
  109.         # print(f'b ev={ev0:.6f} words={len(a)} dict={len(d)}')
  110.         # a = rebuild(s, d)
  111.         # d = Counter(a)
  112.         # ev0 = evaluate(a, d)
  113.         # print(f'a ev={ev0:.6f} words={len(a)} dict={len(d)}')
  114.  
  115.     print(f'ev={ev0:.6f} words={len(a)} dict={len(d)}')
  116.     return ' '.join(a)
  117.  
  118.  
  119. def main():
  120.     # filename = '671136.txt'
  121.     filename = 'warandpeace.txt'
  122.     with open(filename, 'rt', encoding='utf-8') as f:
  123.         s = f.read()
  124.     s = prepare_ru(s)
  125.     print(len(s))
  126.     s = s[:100000]
  127.     t = ''.join(s.split())
  128.     r = find_word_split(t)
  129.     print(r)
  130.  
  131.  
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
  134.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement