Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from collections import Counter
- from math import log2
- def prepare_ru(s):
- A = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'.lower() + ' '
- s = ''.join(e for e in s.replace('\n', ' ').replace('\t', ' ').lower() if e in A)
- s = ' '.join(s.split())
- return s
- def evaluate(a, d):
- n = sum(d.values()) # d.total()
- h0, h1 = .0, .0
- for k, v in d.items():
- h0 -= v * log2(v / n)
- h1 += len(k)
- # print(f'{h0=} {h1=}')
- return h0 + h1 * log2(33)
- def sequence_join_slow(a, e0, e1):
- i = 0
- while i + 1 < len(a):
- if a[i] == e0 and a[i + 1] == e1:
- a[i] += a[i + 1]
- a.pop(i + 1)
- i += 1
- return a
- def sequence_join(a, e0, e1):
- b = []
- i = 0
- while i < len(a):
- if i + 1 < len(a) and a[i] == e0 and a[i + 1] == e1:
- b.append(e0 + e1)
- i += 2
- else:
- b.append(a[i])
- i += 1
- return b
- def evaluate_join(a, e0, e1):
- a = a.copy()
- a = sequence_join(a, e0, e1)
- d = Counter(a)
- return evaluate(a, d)
- def try_relax(a, d, ev0, /, min_delta, min_cnt, strict_greedy):
- d2, d2n = Counter(), len(a) - 1
- for e0, e1 in zip(a[:-1], a[1:]):
- d2[(e0, e1)] += 1
- d2ev = [(evaluate_join(a, e0, e1), (e0, e1), cnt01)
- for (e0, e1), cnt01 in d2.most_common()
- if cnt01 >= min_cnt]
- d2ev.sort()
- for _, (e0, e1), cnt01 in d2ev:
- if e0 + e1 in d:
- continue
- ev1 = evaluate_join(a, e0, e1)
- if ev0 - ev1 < min_delta:
- continue
- a = sequence_join(a, e0, e1)
- d = Counter(a)
- ev1 = evaluate(a, d)
- print(f'{ev1:9.3f} {ev0 - ev1:9.3f} {e0:>16} {e1:>16} {cnt01:6} {len(a):9} {len(d):9}')
- ev0 = ev1
- if strict_greedy:
- break
- return a, d, ev0
- def rebuild(s, d):
- a, n = [], len(s)
- i = 0
- while i < n:
- for k in range(min(32, n - i), 0, -1):
- if s[i:i + k] in d:
- a.append(s[i:i + k])
- i += k
- break
- else:
- print('fail non-recursive', i, '/', n)
- break
- return a
- def find_word_split(s, /,
- min_delta=20,
- min_cnt=1,
- strict_greedy=False
- ):
- a = list(s)
- d = Counter(a)
- ev0 = evaluate(a, d)
- print(f'parameters {min_delta=} {min_cnt=}')
- print(f'ev={ev0:.6f} words={len(a)} dict={len(d)}')
- print(f'{"ev":>9} {"delta_ev":>9} {"left":>16} {"right":>16} {"cnt":>6} {"words":>9} {"dict":>9}')
- while True:
- a, d, ev1 = try_relax(a, d, ev0,
- min_delta=min_delta, min_cnt=min_cnt, strict_greedy=strict_greedy)
- if ev1 >= ev0 - 1e-6:
- break
- ev0 = ev1
- # print(f'b ev={ev0:.6f} words={len(a)} dict={len(d)}')
- # a = rebuild(s, d)
- # d = Counter(a)
- # ev0 = evaluate(a, d)
- # print(f'a ev={ev0:.6f} words={len(a)} dict={len(d)}')
- print(f'ev={ev0:.6f} words={len(a)} dict={len(d)}')
- return ' '.join(a)
- def main():
- # filename = '671136.txt'
- filename = 'warandpeace.txt'
- with open(filename, 'rt', encoding='utf-8') as f:
- s = f.read()
- s = prepare_ru(s)
- print(len(s))
- s = s[:100000]
- t = ''.join(s.split())
- r = find_word_split(t)
- print(r)
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement