﻿

# Untitled

a guest
Apr 10th, 2020
176
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. from sklearn.feature_extraction.text import CountVectorizer
2. from pymorphy2 import MorphAnalyzer
3. import numpy as np
4. import pandas as pd
5.
def _as_dense_array(matrix):
    """Return *matrix* as a dense float ``numpy`` array."""
    # Sparse results expose ``toarray``; anything else (nested lists, dense
    # arrays) is handed to ``np.asarray`` directly.
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    return np.asarray(matrix, dtype=float)


def finder(corpus, text, number=3, vectorizer=None):
    """Return the documents of *corpus* most similar to *text*.

    Documents and the query are turned into term vectors by *vectorizer* and
    ranked by cosine similarity to the query, best match first. Documents
    with equal similarity keep their corpus order.

    Args:
        corpus: Iterable of documents (strings) to search in.
        text: Query document.
        number: Maximum number of documents to return.
        vectorizer: Object providing ``fit_transform(docs)`` and
            ``transform(docs)``. Defaults to a fresh ``CountVectorizer()``.

    Returns:
        A list of at most ``number`` documents from *corpus*. It is shorter
        when the corpus does not hold enough candidates, and empty when
        ``number`` is not positive or the corpus is empty.
    """
    docs = list(corpus)
    if number <= 0 or not docs:
        return []

    if vectorizer is None:
        vectorizer = CountVectorizer()

    doc_vectors = _as_dense_array(vectorizer.fit_transform(docs))
    query_vector = _as_dense_array(vectorizer.transform([text])).ravel()

    # Cosine similarity of every document against the query in one pass.
    dots = doc_vectors @ query_vector
    norms = np.linalg.norm(doc_vectors, axis=1) * np.linalg.norm(query_vector)
    # An all-zero vector (no known terms) has no direction; score it 0
    # instead of dividing by zero and poisoning the ranking with NaN.
    similarities = np.divide(
        dots, norms, out=np.zeros_like(dots), where=norms > 0
    )

    # The query is usually a member of the corpus and would trivially rank
    # first. Drop that single occurrence explicitly rather than blindly
    # discarding the top hit, which loses the best match whenever the query
    # is not in the corpus.
    try:
        self_index = docs.index(text)
    except ValueError:
        self_index = None

    # Stable sort on the negated scores: descending similarity, ties in
    # corpus order. Ranking positions (not score values) avoids returning
    # the same document twice when scores are equal.
    ranked = np.argsort(-similarities, kind="stable").tolist()
    candidates = [i for i in ranked if i != self_index]

    return [docs[i] for i in candidates[:number]]
43.
44.
# Demo run: search the first ten corpus entries for neighbours of the second
# entry, then print the query followed by each match on its own line.
# NOTE(review): `corpus` is not defined in the visible part of this file;
# presumably it is created elsewhere before this runs - confirm.
query = corpus[1]
work = finder(corpus[0:10], query)
print('Наш текст:\n' + query + '\nПохожие')
for doc in work:
    print(doc)
RAW Paste Data