Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Cluster movie synopses with k-means on a tf-idf representation, then
# print, for each cluster, its member movie titles and its top terms.
# Assumes `frame` is a pandas DataFrame with "Synopsis" and "Title"
# columns, defined earlier in the file.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans  # was missing: KMeans was used but never imported
from sklearn.metrics.pairwise import cosine_similarity

# Collect every synopsis into a list of strings named 'corpus'.
corpus = frame["Synopsis"].tolist()

# Create the tf-idf matrix.
# min_df=0.2 means a term must appear in at least 20% of the documents.
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.2)
X = vectorizer.fit_transform(corpus)

k = 2  # number of clusters in which we want to partition the data

# The proper notion of distance between documents is cosine distance.
# NOTE(review): 'dist' is computed but not used below — KMeans.fit(X)
# clusters on the raw tf-idf vectors with Euclidean distance internally.
dist = 1 - cosine_similarity(X)

# Run the k-means algorithm.
model = KMeans(n_clusters=k)
model.fit(X)

no_words = 4  # number of words to print per cluster
# Sort each cluster centre's term weights in descending order so the
# first indices are the terms closest to (heaviest in) the centroid.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2.
terms = vectorizer.get_feature_names_out()
labels = model.labels_  # cluster label assigned to each document

print("Top terms per cluster:\n")
for i in range(k):
    print("Cluster %d movies:" % i, end='')
    for title in frame["Title"][labels == i]:
        print(' %s,' % title, end='')
    print()  # newline between the movie list and the word list
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :no_words]:
        print(' %s' % terms[ind], end=',')
    print()
    print()
- Top terms per cluster:
- Cluster 0 movies: Mad Max: Fury Road, The Matrix, No Country for Old Men, A Beautiful Mind, Inception, Frozen, Finding Nemo, Toy Story,
- Cluster 0 words: room, tank, says, joe,
- Cluster 1 movies: The King's Speech, The Lion King, Aladdin, Cinderella, Robin Hood,
- Cluster 1 words: king, prince, john, palace,
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement