Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Hierarchical (Ward) clustering over the TF-IDF term space ------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

# Vectorize the corpus. `data` (iterable of raw documents) and `k`
# (number of clusters) are assumed to be defined earlier in the script.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data)

# Cosine *distance* between columns of X. Because of the transpose (X.T),
# rows of C correspond to vocabulary terms, so this clusters terms rather
# than documents -- confirm that is the intent.
C = 1 - cosine_similarity(X.T)

# NOTE(review): Ward linkage is defined only for euclidean feature vectors;
# passing the precomputed distance matrix C makes sklearn treat each row of
# distances as a feature vector, which is almost certainly not what was
# meant. Either fit on the raw vectors, or use linkage='average' together
# with a precomputed metric. Left as-is pending confirmation.
ward = AgglomerativeClustering(n_clusters=k, linkage='ward').fit(C)
label = ward.labels_
# --- K-means clustering of the documents; print top terms per cluster -----
from sklearn.cluster import KMeans

km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

# argsort ascending, then reverse each row so the heaviest terms come first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; the supported
# replacement is get_feature_names_out().
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print("Cluster %d:" % i, end='')
    # Ten highest-weighted terms for this centroid, on one line.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # Terminate the line: the original used end='' throughout, so
    # consecutive clusters ran together on a single output line.
    print()
# --- NMF topic model on the same TF-IDF matrix; print top terms per topic -
from sklearn.decomposition import NMF

nmf = NMF(n_components=k, random_state=1).fit(X)

# get_feature_names() was removed in scikit-learn 1.2; use the *_out form.
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # Slice [:-10 - 1:-1] walks argsort's ascending order backwards,
    # yielding the ten highest-weighted terms, largest first.
    print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))
    print()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement