Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import Levenshtein
- from math import log
- import sqlite3
- import interactive_spellchecker
- import os
- r'''
- def another_init():
- conn = sqlite3.connect(r'C:\Users\pervu
- \Desktop\levenshtein_speller\speller.db')
- cursor = conn.cursor()
- cursor.execute("CREATE VIRTUAL TABLE freq USING fts5(names)")
- with open(r'C:\Users\pervu\Desktop\Python\hagen_freq_desc\freq.txt') as f:
- for l in f:
- parts = l.split(" | ")
- lem = parts[3]
- cursor.execute("insert into freq values(?)", (lem,))
- conn.commit()
- conn.close()
- '''
def main():
    """Read unspaced text from stdin, split it into words, print and return it.

    Returns:
        The input line with spaces inserted between the recognised words,
        as produced by ``Splitter.infer_spaces``.
    """
    # Design notes (translated from the original author's comments):
    #   * plain brute force is deliberately avoided;
    #   * this is an application of Zipf's law (a word's frequency is
    #     inversely proportional to its rank), so the frequencies
    #     themselves need not be stored.
    # Sample input: 'шикарноможноделитьноиногданельзяаявообщекошечка'
    raw_text = input()
    segmented = Splitter().infer_spaces(raw_text)
    print(segmented)
    return segmented
class Splitter():
    """Split unspaced text into dictionary words with minimal total cost.

    Attributes:
        wordcost: maps each dictionary word to ``log((rank + 1) * log(N))``,
            where ``rank`` is the word's 0-based position in the loaded word
            list and ``N`` is the list length.
        maxword: length of the longest dictionary word; bounds how far back
            the segmentation looks for a candidate word.
    """

    def __init__(self):
        """Load the word list from ``speller.db`` and precompute word costs.

        The database file is looked up next to this source file and every
        row of its ``freq`` table is read. The rows are passed through
        ``interactive_spellchecker.appropriate_list``.
        NOTE(review): the cost formula only makes sense if that helper
        returns words ordered from most to least frequent -- confirm.

        Raises:
            ValueError: if fewer than two words are loaded. ``log(N)`` is
                zero for one word and undefined for none, so no cost can
                be computed (previously an opaque "math domain error").
        """
        db_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), r'speller.db')
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT * FROM freq')
            rows = cursor.fetchall()
        finally:
            # Release the connection even if the query fails. No commit is
            # needed because nothing is written.
            conn.close()

        words = interactive_spellchecker.appropriate_list(rows)
        word_count = len(words)
        if word_count < 2:
            raise ValueError(
                "Splitter needs at least two dictionary words, got %d"
                % word_count)

        scale = log(word_count)  # loop-invariant part of the cost formula
        self.wordcost = {
            word: log((rank + 1) * scale) for rank, word in enumerate(words)
        }
        self.maxword = max(len(word) for word in words)

    def infer_spaces(self, s):
        """Return ``s`` with spaces inserted between the inferred words.

        Dynamic programming over prefixes of ``s``: for each prefix the
        best split is chosen among all dictionary words (up to ``maxword``
        characters long) that end at that position.

        A character that cannot be covered by any dictionary word is
        emitted as a one-character piece. Splits are ranked first by how
        few such unknown characters they contain, then by total word cost,
        then by the shorter final piece. Text that is fully coverable by
        dictionary words therefore gets exactly the minimum-cost split,
        while an unknown character no longer stops the text after it from
        being split into words.

        Args:
            s: text without spaces.

        Returns:
            The pieces of ``s`` joined by single spaces ("" for empty input).
        """
        wordcost = self.wordcost
        window = max(1, self.maxword)  # always consider at least 1 char back

        # best[i] = (unknown_chars, total_cost, last_piece_length) for s[:i]
        best = [(0, 0, 0)]
        for end in range(1, len(s) + 1):
            options = []
            for start in range(max(0, end - window), end):
                prev_unknown, prev_cost, _ = best[start]
                piece_cost = wordcost.get(s[start:end])
                if piece_cost is not None:
                    options.append(
                        (prev_unknown, prev_cost + piece_cost, end - start))
                elif end - start == 1:
                    # Unknown single character: allowed, but counted so
                    # that it is used only when unavoidable.
                    options.append((prev_unknown + 1, prev_cost, 1))
            best.append(min(options))

        # Walk back from the end using the piece lengths chosen above.
        pieces = []
        end = len(s)
        while end > 0:
            length = best[end][2]
            pieces.append(s[end - length:end])
            end -= length
        return " ".join(reversed(pieces))
# Run the interactive splitter only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement