Advertisement
Guest User

Untitled

a guest
Dec 10th, 2019
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.61 KB | None | 0 0
  1. def splitter(orig_word, dict_words):
  2. for i in range(len(orig_word)):
  3. l = orig_word[:i]
  4. r = orig_word[i:]
  5. if (l in dict_words) & (r in dict_words):
  6. return [l, r]
  7. return 0
  8.  
  9.  
  10. def word_forms(words_in_text, dict_words):
  11. forms = []
  12. count = 0
  13. print('\nСловоформы: ', len(words_in_text), '\n')
  14. for i in words_in_text:
  15. if i not in forms:
  16. forms.append(i)
  17. for i in forms:
  18. if i in dict_words:
  19. count += 1
  20. print('Различные словоформы: ', len(forms), '\n')
  21. print('Количество словоформ в словаре:', count, '\n')
  22. return len(forms)
  23.  
  24.  
  25. def punctuation_remover(orig_str):
  26. replaced_str = orig_str.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace("(", "")\
  27. .replace(")", "").replace("»", "").replace("«", "").replace(";", "").replace(":", "")
  28. return replaced_str
  29.  
  30.  
  31. def editorial_distance(w1, w2):
  32. n, m = len(w1), len(w2)
  33. if n > m:
  34. w1, w2 = w2, w1
  35. n, m = m, n
  36. cur = range(n + 1)
  37. for i in range(1, m + 1):
  38. prev, cur = cur, [i] + [0] * n
  39. for j in range(1, n + 1):
  40. add = prev[j] + 1
  41. delete = cur[j - 1] + 1
  42. change = prev[j - 1]
  43. if w1[j - 1] != w2[i - 1]:
  44. change += 1
  45. cur[j] = min(add, delete, change)
  46.  
  47. return cur[n]
  48.  
  49.  
  50. dict_filename = '/Users/gadzh/OneDrive/Рабочий стол/цк, эвм/цк/dict1.txt'
  51. text_filename = "/Users/gadzh/OneDrive/Рабочий стол/цк, эвм/цк/brain1102.txt"
  52. with open(text_filename, encoding = "utf-8") as original_text:
  53. orig_words = punctuation_remover(original_text.read().lower()).split()
  54. print(orig_words)
  55.  
  56. dict = {}
  57.  
  58. for line in open(dict_filename, "r"):
  59. data = line.split()
  60. dict[data[0]] = data[1]
  61. keys = list(dict.keys())
  62. word_forms(orig_words, keys)
  63. errors = []
  64. for word in orig_words:
  65. if word not in keys:
  66. errors.append(word)
  67.  
  68. print(errors, '\nКоличество ошибок в тексте: ', len(errors))
  69. corrections = []
  70. split_corrections = []
  71. for error in errors:
  72. min_distance = 3
  73. correct = {}
  74. splitted = splitter(error, keys)
  75. quantity = 0
  76. if splitted != 0:
  77. correct[error] = splitted
  78. split_corrections.append(correct)
  79. print(correct, "Расстояние: ", 1)
  80. continue
  81. for right_word in keys:
  82. if editorial_distance(error, right_word) < min_distance:
  83. correct[error] = right_word
  84. min_distance = editorial_distance(error, right_word)
  85. quantity = int(dict[right_word])
  86. else:
  87. if (editorial_distance(error, right_word) == min_distance) & (quantity > int(dict[right_word])):
  88. correct[error] = right_word
  89. min_distance = editorial_distance(error, right_word)
  90. quantity = int(dict[right_word])
  91.  
  92. if min_distance < 3:
  93. print(correct, '; distance:', min_distance)
  94. corrections.append(correct)
  95. corrections_list = []
  96. for i in corrections:
  97. corrections_list.append(list(i.keys())[0])
  98. for i in split_corrections:
  99. corrections_list.append(list(i.keys())[0])
  100. for i in corrections_list:
  101. errors.remove(i)
  102.  
  103. if errors:
  104. print(errors)
  105. for i in errors:
  106. min_distance = 1000
  107. for j in keys:
  108. if editorial_distance(i, j) < min_distance:
  109. min_distance = editorial_distance(i, j)
  110. print('Слово не исправлено: ', i, ", расстояние: ", min_distance)
  111.  
  112.  
  113. with open(text_filename, "r") as orig_text:
  114. text = orig_text.read()
  115. for i in corrections:
  116. if list(i.keys())[0] in text:
  117. text = text.replace(list(i.keys())[0], i[list(i.keys())[0]])
  118. if i[list(i.keys())[0]].replace(list(i.keys())[0], i[list(i.keys())[0]]) in text:
  119. text = text.replace(i[list(i.keys())[0]].replace(list(i.keys())[0], i[list(i.keys())[0]]), i[list(i.keys())[0]])
  120.  
  121. for i in split_corrections:
  122. punctuation = ".,();:»«!?"
  123. pattern = i[list(i.keys())[0]][0] + i[list(i.keys())[0]][1]
  124. if pattern in text:
  125. text = text.replace(pattern, i[list(i.keys())[0]][0] + " " + i[list(i.keys())[0]][1])
  126. for char in punctuation:
  127. pattern = i[list(i.keys())[0]][0] + char + i[list(i.keys())[0]][1]
  128. if pattern in text:
  129. text = text.replace(pattern, i[list(i.keys())[0]][0] + char + " " + i[list(i.keys())[0]][1])
  130.  
  131. orig_words = punctuation_remover(text.lower()).split()
  132. word_forms(orig_words, keys)
  133. print(text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement