Advertisement
Guest User

Untitled

a guest
Mar 29th, 2017
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.59 KB | None | 0 0
  1. # coding=utf-8
  2.  
  3. from collections import Counter, defaultdict
  4. import re
  5.  
  6.  
  7. class Keyboard:
  8. arrs = ['qwertyuiop', 'asdfghjkl', 'zxcvbnm', 'qaz', 'wsx', 'edc', 'rfv', 'tgb', 'yhn', 'ujm', 'ik', 'ol',
  9. 'йцукенгшщзхъ', 'фывапролджэ', 'ячсмитьбю', 'йфя', 'цыч', 'увс', 'кам', 'епи', 'нрт', 'гоь', 'шлб', 'щд', 'хэ']
  10.  
  11. def __init__(self):
  12. self.dict = defaultdict(str)
  13. for word in self.arrs:
  14. for ln in range(1, len(word)):
  15. self.dict[word[ln]] += word[ln-1]
  16. self.dict[word[ln-1]] += word[ln]
  17.  
  18. def close(self, a, b):
  19. if b in self.dict[a] or a in self.dict[b]:
  20. return True
  21. return False
  22.  
  23.  
  24. class Spellchecker:
  25.  
  26. def __init__(self, fname=None):
  27. if fname is None:
  28. self.dic = self.open_book('book1.txt')
  29. else:
  30. self.dic = self.open_book(fname)
  31.  
  32. def levenshtein(self, word1, word2, insert=1, delete=1, sub_close=1, sub_dist=2):
  33. table = [[j for j in range(len(word1) + 1)]] +\
  34. [[i + 1] + [None] * len(word1)
  35. for i in range(len(word2))]
  36.  
  37. on_keyboard = Keyboard()
  38.  
  39. for i in range(len(word2)):
  40. for j in range(len(word1)):
  41. if word1[j] == word2[i]:
  42. replacement = table[i][j]
  43. elif on_keyboard.close(word1[j], word2[i]):
  44. replacement = table[i][j] + sub_close
  45. else:
  46. replacement = table[i][j] + sub_dist
  47. insertion = table[i][j + 1] + insert
  48. removal = table[i + 1][j] + delete
  49. table[i + 1][j + 1] = min(replacement,
  50. insertion, removal)
  51. return table[len(word2)][len(word1)]
  52.  
  53. def open_book(self, fname):
  54. with open(fname, 'r', encoding='utf-8') as book:
  55. r = book.readlines()
  56. r = [word.strip('\r\n').lower() for word in r]
  57. dict = Counter(r)
  58. return dict
  59.  
  60. def check(self, word):
  61. if word in self.dic:
  62. return word
  63. d = defaultdict(list)
  64. for key in self.dic:
  65. d[self.levenshtein(word, key)].append(key)
  66. bestDif = d[sorted(d.keys())[0]]
  67. res = sorted([(w, self.dic[w]) for w in bestDif], key=lambda a: a[1], reverse=True)
  68. return res
  69.  
  70. def check_word(self, word):
  71. res = self.check(word)
  72. if isinstance(res, list):
  73. return ', '.join(i[0] for i in res)
  74. return res
  75.  
  76. def check_text(self, text):
  77. result = ''
  78. text = re.split(u'([ .,()–‒―—;!%$‱‰·»:<?>«*-\\[\\]^|{}+;\r\n])', text)
  79. for i in text:
  80. if i in u' .,()–‒―—;!%$‱‰·»:<?>«*-\\[\\]^|{}+;\r\n':
  81. result += i
  82. else:
  83. temp = self.check(i.lower())
  84. if isinstance(temp, list):
  85. if i[0].isupper():
  86. result += temp[0][0][0].upper() + temp[0][0][1:]
  87. else:
  88. result += temp[0][0]
  89. else:
  90. if i[0].isupper():
  91. result += temp[0].upper() + temp[1:]
  92. else:
  93. result += temp
  94. return result
  95.  
  96.  
# Module-level checker instance; with no argument the constructor loads 'book1.txt'.
sp = Spellchecker()
# check_word:
# if the word needs to be replaced, lists all replacement candidates and how many times they occurred in the text;
# if no replacement is needed, simply returns the word itself.
# NOTE(review): as written, check_word joins only the candidate words; the occurrence counts are returned by check().
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement