Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
k = 2  # Number of clusters to partition the data into

# Cosine distance is the proper notion of distance for documents
# (term-frequency vectors); derive it from cosine similarity.
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): `dist` is not used below — presumably consumed later in the
# file (e.g. for plotting/MDS); verify before removing.
dist = 1 - cosine_similarity(X)

# Run the KMeans algorithm on the document-term matrix X.
model = KMeans(n_clusters=k)
model.fit(X)

print("Top terms per cluster:\n")
# argsort each centroid's term weights, then reverse so the highest-weight
# term indices come first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() if running on a recent version — confirm.
terms = vectorizer.get_feature_names()
for i in range(k):
    # Print the top 3 terms for this cluster on one line.
    print("Cluster %i:" % i, end='')
    for ind in order_centroids[i, :3]:
        print(' %s,' % terms[ind], end='')
    print("")
- Top terms per cluster:
- Cluster 0: awesome, staff, cs50,
- Cluster 1: dog, cat, keeps,
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement