Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn.feature_extraction.text import CountVectorizer
- from pymorphy2 import MorphAnalyzer
- import numpy as np
- import pandas as pd
def finder(corpus, text, number=3):
    """Return the documents of *corpus* that are most similar to *text*.

    Every document is turned into a bag-of-words count vector by a
    ``CountVectorizer`` fitted on *corpus*; documents are then ranked by the
    cosine similarity between their vector and the vector of *text*.

    The single best-ranked document is always skipped. The function is meant
    to be called with a *text* that is itself a member of *corpus*, in which
    case the best match is the query and would be a useless answer.

    Args:
        corpus: Sequence of document strings to search in.
        text: The query document.
        number: Maximum number of similar documents to return.

    Returns:
        A list with at most *number* documents taken from *corpus*, ordered
        from most to least similar. Documents with equal similarity keep
        their corpus order. Fewer than *number* documents are returned when
        the corpus holds fewer than ``number + 1`` documents; an empty list
        is returned for an empty corpus or a non-positive *number*.
    """
    if number <= 0 or len(corpus) == 0:
        return []

    vect = CountVectorizer()
    # Work in float so the division below never truncates and can write into
    # a float output buffer.
    corpus_t = vect.fit_transform(corpus).toarray().astype(float)
    text_t = vect.transform([text]).toarray()[0].astype(float)

    # Cosine similarity of every corpus row against the query in one shot.
    dots = corpus_t @ text_t
    corpus_norms = np.sqrt(np.sum(corpus_t ** 2, axis=1))
    text_norm = np.sqrt(np.sum(text_t ** 2))
    norms = corpus_norms * text_norm

    # A document (or query) without any known word has a zero vector; treat
    # its similarity as 0 instead of producing NaN from 0 / 0, which would
    # make the ranking below meaningless.
    percents = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Rank positions instead of looking scores up again by value: a stable
    # sort keeps tied documents distinct and in corpus order, so no document
    # can be returned twice and positions always refer to the real corpus.
    ranking = np.argsort(-percents, kind="stable")

    # Skip rank 0 (the query itself when it is part of the corpus). Slicing
    # clamps to the available length, so a short corpus cannot overflow.
    return [corpus[i] for i in ranking[1:number + 1].tolist()]
# Demo: use the second document of the corpus as the query, search the first
# ten documents, and print the query followed by each similar document.
work = finder(corpus[0:10], corpus[1])
print('Наш текст:\n' + corpus[1] + '\nПохожие')
for similar_text in work:
    print(similar_text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement