• API
• FAQ
• Tools
• Archive
SHARE
TWEET

# Untitled

a guest Dec 8th, 2019 64 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import numpy as np
2.
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-item insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First sequence (typically a ``str``).
        seq2: Second sequence; must support ``len()`` and repeated iteration.

    Returns:
        The distance as a plain ``int`` (0 when the sequences are equal).
    """
    # Only the previous row of the dynamic-programming table is ever read,
    # so two plain lists are enough; no full (len+1) x (len+1) matrix needed.
    # Row 0: turning the empty prefix into seq2[:y] takes y insertions.
    previous = list(range(len(seq2) + 1))

    for x, item1 in enumerate(seq1, start=1):
        # Column 0: turning seq1[:x] into the empty prefix takes x deletions.
        current = [x]
        for y, item2 in enumerate(seq2, start=1):
            current.append(min(
                previous[y] + 1,                     # delete item1
                current[y - 1] + 1,                  # insert item2
                previous[y - 1] + (item1 != item2),  # match (free) or substitute
            ))
        previous = current

    return previous[-1]
27.
28.
# Quick sanity check of the distance function.
print(levenshtein("автомобиль", "автобиль"))

# NOTE(review): the statement that fills `data` is missing from this paste
# (the paste numbering skips line 31). Reading the whole file is inferred from
# how `data` is used below (str.replace / str.lower / str.split) - confirm.
# NOTE(review): no explicit encoding is given, so the platform default is
# used; confirm the encoding of data.txt and dict.txt.
with open('data.txt', 'r') as file:
    data = file.read()

# Remove punctuation in a single pass, then normalise case.
data = data.translate(str.maketrans('', '', '!?,;.:«()»'))
data = data.lower()

# Tokens are separated by single spaces only (newlines are not separators).
tokens = data.split(' ')
print("Словоформ в тексте " + str(len(tokens)))

# Distinct word forms in order of first appearance; dict keys give O(1)
# duplicate detection instead of scanning a list for every token.
word_forms = list(dict.fromkeys(tokens))
print("Разных словоформ в тексте "+ str(len(word_forms)))

# Dictionary file: one "<word> <count>" pair per line.
# NOTE(review): the statement that fills `dic_strs` is also missing from the
# paste (numbering skips line 50); readlines() is inferred from the
# per-line '\n' stripping below - confirm.
dic = {}
with open('dict.txt', 'r') as file:
    dic_strs = file.readlines()
for s in dic_strs:
    entry = s.replace('\n', '')
    if not entry:
        # A blank line (e.g. at the end of the file) carries no entry.
        continue
    word, count = entry.split(' ')
    dic[word] = int(count)

# Split the distinct word forms into known and unknown ones.
c = 0
not_in_dic = []
# Every unknown word form maps to 0: `word_forms` holds no duplicates, so no
# word is ever seen twice here and the counter is never incremented.
not_in_dict_counter = {}
for word in word_forms:
    if word in dic:
        c += 1
    else:
        not_in_dic.append(word)
        not_in_dict_counter[word] = 0
print("Разных словоформ из текста в словаре " + str(c))
70.
71.
72.
73.
def mistakes(w1):
    """Suggest a correction for a word that is not in the dictionary.

    Reads the module-level ``dic`` mapping (word -> integer count) and uses
    ``levenshtein`` to measure distances. Two repairs are tried, in order:

    1. Missing space: if ``w1`` can be cut into two parts that are both
       dictionary words, the parts joined by a single space are returned.
    2. Nearest word: the dictionary word with the smallest edit distance to
       ``w1`` is returned, provided that distance is at most 2 and strictly
       smaller than ``len(w1)``. Among equally close words the one with the
       larger count in ``dic`` wins.

    Every outcome is printed as ``<word> - <suggestion> - <distance>``.

    Args:
        w1: The word to correct.

    Returns:
        The suggested replacement, or ``w1`` itself when nothing suitable
        was found.
    """
    # The split check is cheap and always takes precedence over the
    # nearest-word result, so do it first and skip the full dictionary scan
    # whenever it succeeds.
    for i in range(1, len(w1)):
        w11, w12 = w1[:i], w1[i:]
        if w11 in dic and w12 in dic:
            new_w = w11 + " " + w12
            print(f"{w1} - {new_w} - {1}")
            return new_w

    # A candidate must beat len(w1), i.e. it has to share something with w1.
    min_mist = len(w1)
    w2 = ""
    for word2, count2 in dic.items():
        lv = levenshtein(w1, word2)
        if lv < min_mist:
            min_mist = lv
            w2 = word2
        elif lv == min_mist and w2 and count2 > dic[w2]:
            # Same distance as the current best: prefer the higher count.
            w2 = word2

    # `w2` stays empty when no dictionary word was closer than len(w1).
    # Without this guard a word of length <= 2 would be "corrected" to an
    # empty string, which makes the caller delete it from the text.
    if w2 and min_mist <= 2:
        print(f"{w1} - {w2} - {min_mist}")
        return w2

    print(f"{w1} - не найдено - >2")
    return w1
103.
# Ask for a correction for every out-of-dictionary word form, lowest counter
# first (sorted() is stable, so equal counters keep their insertion order).
corrections = {}
for word, _ in sorted(not_in_dict_counter.items(), key=lambda item: item[1]):
    if not word:
        # The empty token produced by consecutive spaces has nothing to fix.
        continue
    new_w = mistakes(word)
    if new_w != word:
        corrections[word] = new_w

# Apply the corrections token by token. A plain substring replace on the whole
# text would also rewrite longer words that merely contain a misspelled word,
# and later replacements could alter the results of earlier ones.
data = ' '.join(corrections.get(token, token) for token in data.split(' '))

# Recompute the statistics on the corrected text.
tokens = data.split(' ')
print("Словоформ в тексте " + str(len(tokens)))

# Distinct word forms in order of first appearance.
word_forms = list(dict.fromkeys(tokens))
print("Разных словоформ в тексте "+ str(len(word_forms)))

# `dic` has not changed since it was loaded from dict.txt earlier in the
# script, so it is reused here rather than reading the file a second time.
c = 0
not_in_dic = []
for word in word_forms:
    if word in dic:
        c += 1
    else:
        not_in_dic.append(word)
print("Разных словоформ из текста в словаре " + str(c))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.

Top