Advertisement
Guest User

Untitled

a guest
Apr 10th, 2020
233
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.51 KB | None | 0 0
  1. from sklearn.feature_extraction.text import CountVectorizer
  2. from pymorphy2 import MorphAnalyzer
  3. import numpy as np
  4. import pandas as pd
  5.  
  6. def finder(corpus, text, number=3):
  7.     def cosine_sim(v1, v2):
  8.         v1 = np.array(v1)
  9.         v2 = np.array(v2)
  10.         res = v1 @ v2
  11.         v1_norm = np.sqrt(np.sum([i**2 for i in v1]))
  12.         v2_norm = np.sqrt(np.sum([i**2 for i in v2]))
  13.         return res / (v1_norm * v2_norm)
  14.    
  15.     vect = CountVectorizer()
  16.     corpus_t = vect.fit_transform(corpus).toarray()
  17.     text_t = vect.transform([text]).toarray()[0]
  18.    
  19.     percents = []
  20.     for i in range(len(corpus)):
  21.         percents.append(cosine_sim(corpus_t[i], text_t))
  22.        
  23.     sort = sorted(percents, reverse=True)
  24.     index = []
  25.    
  26.     for i in range(number):
  27.         if i == 0:
  28.             index.append(percents.index(sort[i+1]))
  29.         else:            
  30.             for j in range(len(index)):
  31.                 if percents.index(sort[i+1]) == index[j]:
  32.                     ind = percents.index(sort[i+1])
  33.                     percents.remove(percents[ind])
  34.                     index.append(percents.index(sort[i+1]))
  35.                     break
  36.                 else:
  37.                     index.append(percents.index(sort[i+1]))
  38.                     break
  39.  
  40.     res = [corpus[index[i]] for i in range(number)]
  41.        
  42.     return res
  43.    
  44.    
  45. work = finder(corpus[0:10], corpus[1])
  46. print('Наш текст:\n' + corpus[1] + '\nПохожие')
  47. for i in range(len(work)):
  48.     print(work[i])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement