Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def splitter(orig_word, dict_words):
    """Split a glued-together token into two dictionary words.

    Every cut position is tried from left to right and the first one whose
    both halves are found in ``dict_words`` wins.

    Args:
        orig_word: The token to split.
        dict_words: Any container supporting ``in`` (list, set, dict, ...).

    Returns:
        ``[left, right]`` for the first valid cut, or ``0`` when no cut
        works. The ``0`` sentinel is kept because callers compare the
        result against ``0``.
    """
    # Start at 1 so that neither half can be the empty string; a cut at 0
    # would "split" the word into '' and the unchanged word itself.
    for cut in range(1, len(orig_word)):
        left, right = orig_word[:cut], orig_word[cut:]
        # `and` short-circuits, so the right half is only looked up when
        # the left half is already known to be a dictionary word.
        if left in dict_words and right in dict_words:
            return [left, right]
    return 0
def word_forms(words_in_text, dict_words):
    """Print word-form statistics for a text and return the distinct count.

    Three lines are printed: the total number of tokens, the number of
    distinct tokens, and how many of the distinct tokens occur in
    ``dict_words``.

    Args:
        words_in_text: Sequence of (hashable) tokens from the text.
        dict_words: Iterable of known dictionary words.

    Returns:
        The number of distinct tokens in ``words_in_text``.
    """
    print('\nСловоформы: ', len(words_in_text), '\n')

    # Sets give O(1) membership; the previous list-based deduplication and
    # dictionary scan were quadratic in the size of the text/dictionary.
    distinct_forms = set(words_in_text)
    known_words = set(dict_words)
    count = len(distinct_forms & known_words)

    print('Различные словоформы: ', len(distinct_forms), '\n')
    print('Количество словоформ в словаре:', count, '\n')
    return len(distinct_forms)
def punctuation_remover(orig_str, punctuation=".,?!()»«;:"):
    """Return ``orig_str`` with every punctuation character deleted.

    Args:
        orig_str: The text to clean.
        punctuation: Characters to delete. Defaults to the set that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A copy of ``orig_str`` without any character from ``punctuation``.
        Characters are removed, not replaced by spaces, so words separated
        only by punctuation end up glued together.
    """
    # One translate pass instead of a chain of ten str.replace calls.
    return orig_str.translate(str.maketrans("", "", punctuation))
def editorial_distance(w1, w2):
    """Return the Levenshtein (edit) distance between ``w1`` and ``w2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn one string into the other.
    Only two rows of the dynamic-programming table are kept, and the
    shorter string is laid out along the row to keep those rows small.
    """
    if len(w1) <= len(w2):
        shorter, longer = w1, w2
    else:
        shorter, longer = w2, w1

    # Row 0: distance from the empty prefix of `longer` to each prefix of
    # `shorter`.
    row = list(range(len(shorter) + 1))

    for i, long_char in enumerate(longer, start=1):
        above = row
        row = [i]  # column 0: delete the first i characters of `longer`
        for j, short_char in enumerate(shorter, start=1):
            insertion = above[j] + 1
            deletion = row[j - 1] + 1
            # A substitution costs 1 only when the characters differ.
            substitution = above[j - 1] + (short_char != long_char)
            row.append(min(insertion, deletion, substitution))

    return row[-1]
# Spell-checking script.
#
# 1. Tokenise the text and load the dictionary (word -> numeric value).
# 2. Treat every token missing from the dictionary as an error.
# 3. For each error, first try to split it into two dictionary words;
#    otherwise pick the closest dictionary word within edit distance < 3.
# 4. Apply the corrections to the original text and print the statistics.

dict_filename = '/Users/gadzh/OneDrive/Рабочий стол/цк, эвм/цк/dict1.txt'
text_filename = "/Users/gadzh/OneDrive/Рабочий стол/цк, эвм/цк/brain1102.txt"

# Corrections are only accepted when the edit distance is below this bound.
MAX_CORRECTION_DISTANCE = 3

# Read the text exactly once, as UTF-8. The file used to be reopened later
# without an explicit encoding, which decodes it with the platform default
# and can disagree with the UTF-8 tokens produced here.
with open(text_filename, encoding="utf-8") as original_text:
    source_text = original_text.read()
orig_words = punctuation_remover(source_text.lower()).split()
print(orig_words)

# Dictionary: first field of each line is the word, second field is kept as
# the raw string and converted with int() when candidates are compared.
# NOTE(review): the dictionary is still decoded with the platform default
# encoding, exactly as before - confirm the file's encoding and pass it
# explicitly.
dictionary = {}
with open(dict_filename, "r") as dict_file:
    for line in dict_file:
        data = line.split()
        if len(data) < 2:
            # Blank or malformed line: skip it instead of raising IndexError.
            continue
        dictionary[data[0]] = data[1]
keys = list(dictionary)

word_forms(orig_words, keys)

# Membership is tested against the dict itself (O(1)) rather than the list.
errors = [word for word in orig_words if word not in dictionary]
print(errors, '\nКоличество ошибок в тексте: ', len(errors))

# Both lists hold single-entry dicts:
#   corrections:       {error: replacement_word}
#   split_corrections: {error: [left_word, right_word]}
corrections = []
split_corrections = []
for error in errors:
    correct = {}

    # A token made of two glued dictionary words is fixed by splitting it.
    splitted = splitter(error, dictionary)
    if splitted != 0:
        correct[error] = splitted
        split_corrections.append(correct)
        print(correct, "Расстояние: ", 1)
        continue

    min_distance = MAX_CORRECTION_DISTANCE
    quantity = 0
    for right_word in keys:
        # Compute the distance once per candidate; it is the expensive part.
        distance = editorial_distance(error, right_word)
        if distance < min_distance:
            correct[error] = right_word
            min_distance = distance
            quantity = int(dictionary[right_word])
        elif distance == min_distance and quantity > int(dictionary[right_word]):
            # Tie on distance: the candidate with the SMALLER dictionary
            # value wins.
            # NOTE(review): if the second column is a frequency this picks
            # the rarer word; presumably it is a rank - confirm.
            correct[error] = right_word
            quantity = int(dictionary[right_word])

    if min_distance < MAX_CORRECTION_DISTANCE:
        print(correct, '; distance:', min_distance)
        corrections.append(correct)

# Every corrected occurrence is removed from `errors`; what remains could
# not be fixed. There is one correction entry per occurrence, so each
# remove() call drops exactly one occurrence.
corrections_list = [next(iter(fix)) for fix in corrections]
corrections_list += [next(iter(fix)) for fix in split_corrections]
for fixed_word in corrections_list:
    errors.remove(fixed_word)

if errors:
    print(errors)
for uncorrected in errors:
    # Report the distance to the nearest dictionary word, capped at 1000.
    min_distance = 1000
    for known_word in keys:
        min_distance = min(min_distance, editorial_distance(uncorrected, known_word))
    print('Слово не исправлено: ', uncorrected, ", расстояние: ", min_distance)

# Apply the corrections to the original (non-lowercased) text.
# NOTE(review): these are plain substring replacements, so an error string
# that occurs inside a longer word is replaced there as well, and errors
# that are capitalised in the text are not matched - confirm this is
# acceptable.
text = source_text
for fix in corrections:
    wrong, right = next(iter(fix.items()))
    text = text.replace(wrong, right)
    # If `wrong` is a substring of `right`, the replacement above has also
    # mangled every occurrence of `right` in the text (whether it was there
    # before or was just inserted); this restores them. When `wrong` is not
    # part of `right` the call is a no-op.
    mangled = right.replace(wrong, right)
    text = text.replace(mangled, right)

punctuation = ".,();:»«!?"
for fix in split_corrections:
    left, right = next(iter(fix.values()))
    # Words glued directly together: insert a space between them.
    text = text.replace(left + right, left + " " + right)
    # Words glued by a punctuation mark: insert a space after the mark.
    for char in punctuation:
        text = text.replace(left + char + right, left + char + " " + right)

orig_words = punctuation_remover(text.lower()).split()
word_forms(orig_words, keys)
print(text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement