SHARE
TWEET

Untitled

a guest Dec 8th, 2019 64 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import numpy as np
  2.  
  def levenshtein(seq1, seq2):
      """Return the Levenshtein edit distance between ``seq1`` and ``seq2``.

      A dynamic-programming table is filled in which cell ``[i, j]`` holds the
      number of single-item insertions, deletions and substitutions needed to
      turn the first ``i`` items of ``seq1`` into the first ``j`` items of
      ``seq2``.

      The table is a float NumPy array, so the distance comes back as a NumPy
      float (for example ``2.0``), not as a Python ``int``.
      """
      rows = len(seq1) + 1
      cols = len(seq2) + 1
      table = np.zeros((rows, cols))

      # Borders: reaching / leaving the empty prefix costs one edit per item.
      table[:, 0] = np.arange(rows)
      table[0, :] = np.arange(cols)

      for i, left in enumerate(seq1, start=1):
          for j, right in enumerate(seq2, start=1):
              # Matching items can be carried over for free.
              substitution = 0 if left == right else 1
              table[i, j] = min(
                  table[i - 1, j] + 1,
                  table[i - 1, j - 1] + substitution,
                  table[i, j - 1] + 1,
              )

      return table[-1, -1]
  27.  
  28.  
  print(levenshtein("автомобиль", "автобиль"))

  # NOTE(review): no encoding is passed to open(), so the platform default is
  # used to decode both text files - confirm it matches how they were saved.
  with open('data.txt', 'r') as file:
      data = file.read()

  # Drop the punctuation marks listed below.
  for i in "! ? , ; . : « ( ) »".split(' '):
      data = data.replace(i, '')

  # Lower-case and collapse every run of whitespace (line breaks included) to
  # a single space. Deleting the line breaks outright would glue the last
  # word of a line to the first word of the next one, and leftover double
  # spaces would show up as empty "words" when splitting on ' '.
  data = ' '.join(data.lower().split())

  tokens = data.split()

  print("Словоформ в тексте " + str(len(tokens)))

  # dict.fromkeys de-duplicates in linear time and keeps first-seen order.
  word_forms = list(dict.fromkeys(tokens))

  print("Разных словоформ в тексте "+ str(len(word_forms)))

  # dict.txt: one "<word> <count>" pair per line; blank lines are skipped.
  dic = {}
  with open('dict.txt', 'r') as file:
      for s in file:
          fields = s.split()
          if not fields:
              continue
          word, count = fields
          dic[word] = int(count)

  # Number of occurrences in the text of every word missing from the
  # dictionary. The count is taken over all tokens: iterating over the
  # already de-duplicated word_forms could never see a word twice.
  not_in_dict_counter = {}
  for word in tokens:
      if word not in dic:
          not_in_dict_counter[word] = not_in_dict_counter.get(word, 0) + 1

  # Distinct unknown words, in order of first appearance.
  not_in_dic = list(not_in_dict_counter)

  c = sum(1 for word in word_forms if word in dic)
  print("Разных словоформ из текста в словаре " + str(c))
  70.  
  71.  
  72.  
  73.  
  def mistakes(w1):
      """Suggest a correction for a word that is missing from ``dic``.

      Two repairs are tried, in this order of preference:

      1. Missing space: if ``w1`` can be cut into two parts that are both keys
         of ``dic``, the first such cut is returned as ``"part1 part2"``.
      2. Misspelling: the key of ``dic`` with the smallest Levenshtein distance
         to ``w1`` is returned when that distance is at most 2. Only keys
         strictly closer than ``len(w1)`` edits are considered; among equally
         close keys the one with the larger value in ``dic`` wins.

      One line describing the decision is printed in every case.

      Args:
          w1: the word to correct.

      Returns:
          The suggested replacement, or ``w1`` unchanged when no suggestion
          was found. An empty string is never returned for a non-empty word.
      """
      # The split check is cheap and takes precedence, so do it before the
      # expensive scan of the whole dictionary.
      for cut in range(1, len(w1)):
          head = w1[:cut]
          tail = w1[cut:]
          if head in dic and tail in dic:
              new_w = head + " " + tail
              print(f"{w1} - {new_w} - {1}")
              return new_w

      best = ""
      min_mist = len(w1)
      for candidate in dic:
          lv = levenshtein(w1, candidate)
          closer = lv < min_mist
          # A tie only counts against an existing suggestion.
          better_tie = bool(best) and lv == min_mist and dic[candidate] > dic[best]
          if closer or better_tie:
              min_mist = lv
              best = candidate

      # Without the `best` check a short word with no close dictionary entry
      # would be "corrected" to the empty string and vanish from the text.
      if best and min_mist <= 2:
          print(f"{w1} - {best} - {min_mist}")
          return best

      print(f"{w1} - не найдено - >2")
      return w1
  103.    
  # Ask for a suggestion for every unknown word, in ascending order of the
  # count stored for it in not_in_dict_counter.
  corrections = {}
  for word, _ in sorted(not_in_dict_counter.items(), key=lambda item: item[1]):
      new_w = mistakes(word)
      # Ignore empty suggestions so a word is never deleted from the text.
      if new_w and new_w != word:
          corrections[word] = new_w

  # Replace whole words only. A plain str.replace on the full text would also
  # rewrite the same letters wherever they occur inside longer, correct words.
  data = ' '.join(corrections.get(token, token) for token in data.split(' '))

  tokens = data.split(' ')
  print("Словоформ в тексте " + str(len(tokens)))

  # dict.fromkeys de-duplicates in linear time and keeps first-seen order.
  word_forms = list(dict.fromkeys(tokens))

  print("Разных словоформ в тексте "+ str(len(word_forms)))

  # dic still holds the contents of dict.txt (nothing modifies it after it is
  # loaded), so it is reused here instead of reading the file a second time.
  c = 0
  not_in_dic = []
  for word in word_forms:
      if word in dic:
          c += 1
      else:
          not_in_dic.append(word)
  print("Разных словоформ из текста в словаре " + str(c))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top