Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from os import walk
- import numpy as np
- import pandas as pd
- from sklearn.feature_extraction.text import TfidfVectorizer
- import matplotlib.pyplot as plt
- from sklearn.decomposition import PCA
- from sklearn.cluster import KMeans
def plot_tfidf_classfeats_h(dfs):
    """Show one horizontal bar chart of tf-idf scores per cluster.

    Args:
        dfs: Sequence of DataFrames. Each frame must expose ``features``
            and ``score`` columns and a ``label`` attribute, which is used
            for the subplot title.
    """
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    n_panels = len(dfs)
    for i, df in enumerate(dfs):
        # Bar positions are derived from the frame being drawn rather than
        # from dfs[0], so frames of different lengths (and an empty `dfs`)
        # no longer break the plot.
        x = np.arange(len(df))
        ax = fig.add_subplot(1, n_panels, i + 1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("cluster = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.score, align='center', color='#7530FF')
        ax.set_yticks(x)
        # len(x) equals the former x[-1] + 1 but is also valid for an
        # empty frame.
        ax.set_ylim([-1, len(x)])
        ax.set_yticklabels(df.features)
    # The layout only needs adjusting once, after all subplots exist.
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()
def readfiles(mypath):
    """Return the names of the files located directly inside *mypath*.

    Sub-directories are not descended into. A path that ``walk`` cannot
    traverse (e.g. it does not exist) yields an empty list.

    Args:
        mypath: Directory to list.

    Returns:
        A list of file names (without the directory part).
    """
    # walk() yields the top-level directory first, so only that entry is
    # consumed. The previous loop reused the name `filenames` for both the
    # accumulator and the loop variable, which made the list extend itself
    # and returned every name twice.
    _dirpath, _dirnames, names = next(walk(mypath), (mypath, [], []))
    return list(names)
def parse_raw_message(raw_message):
    """Split a raw e-mail string into its sender, recipient and body text.

    A line containing ``:`` is treated as a ``name: value`` header; only
    the ``from`` and ``to`` headers (matched case-insensitively) are kept.
    Every line without a colon is treated as body text.

    Args:
        raw_message: The full message as a single newline-separated string.

    Returns:
        A dict with the keys ``'from'`` and ``'to'`` for each header that
        was found, and ``'body'`` when at least one colon-free line exists.
    """
    wanted = ('from', 'to')
    email = {}
    body_parts = []
    saw_body_line = False
    for line in raw_message.split('\n'):
        # partition() splits on the first colon only, so values that
        # themselves contain a colon are no longer truncated.
        name, sep, value = line.partition(':')
        if not sep:
            saw_body_line = True
            text = line.strip()
            if text:
                body_parts.append(text)
            continue
        key = name.lower()
        if key in wanted:
            email[key] = value.strip()
    if saw_body_line:
        # Join with a space so the last word of one line is not fused with
        # the first word of the next.
        email['body'] = ' '.join(body_parts)
    return email
def email_to_dataframe(filename):
    """Load *filename* with ``np.loadtxt`` and discard the result.

    NOTE(review): despite its name this function builds no DataFrame and
    returns ``None``; it looks like an unfinished stub.

    Args:
        filename: Path handed to ``np.loadtxt``.
    """
    np.loadtxt(filename)
    return None
def parse_into_emails(messages):
    """Parse raw messages and collect their fields column by column.

    Each message is run through ``parse_raw_message``; the resulting dicts
    are then passed to ``map_to_list`` once per field.

    NOTE(review): ``map_to_list`` is not defined in the visible part of
    this file - confirm it is available before calling this function.

    Args:
        messages: Iterable of raw message strings.

    Returns:
        A dict with the keys ``'body'``, ``'to'`` and ``'from_'``.
    """
    parsed = []
    for raw in messages:
        parsed.append(parse_raw_message(raw))

    columns = {}
    # (output key, key looked up in each parsed message)
    for out_key, src_key in (('body', 'body'), ('to', 'to'), ('from_', 'from')):
        columns[out_key] = map_to_list(parsed, src_key)
    return columns
def parse_email(filename):
    """Read one e-mail file and split it into selected headers and body.

    A line containing ``:`` is treated as a ``name: value`` header; the
    ``From``, ``Subject``, ``Organization`` and ``Lines`` headers are kept
    (matched case-insensitively, stored under lowercase keys). Every line
    without a colon is treated as body text.

    Args:
        filename: Path of the file to read.

    Returns:
        A dict holding the extracted headers plus a ``'body'`` string
        (empty when the file has no body lines), or ``-1`` when the file
        cannot be decoded as text.
    """
    # Compared against the lowercased header name. The previous list was
    # capitalised, so it never matched and no header was ever extracted.
    wanted = ('from', 'subject', 'organization', 'lines')

    with open(filename) as f:
        try:
            content = f.readlines()
        except UnicodeDecodeError:
            print('UTf encoding error')
            return -1

    email = {}
    body_parts = []
    for line in content:
        # partition() splits on the first colon only, so values such as
        # "Re: hello" are kept whole.
        name, sep, value = line.partition(':')
        if not sep:
            text = line.strip()
            if text:
                body_parts.append(text)
            continue
        key = name.strip().lower()
        if key in wanted:
            email[key] = value.strip()

    # Always set 'body' (avoids a KeyError for header-only files) and join
    # with spaces so words on adjacent lines are not fused together.
    email['body'] = ' '.join(body_parts)
    return email
def top_tfidf_feats(row, features, top_n=20):
    """Return the highest-scoring features of one tf-idf row.

    Args:
        row: 1-D array of scores, indexable by position.
        features: Sequence of feature names aligned with *row*.
        top_n: Maximum number of features to return.

    Returns:
        A DataFrame with the columns ``features`` and ``score``, ordered
        from highest to lowest score.
    """
    ascending = np.argsort(row)
    best_first = ascending[::-1][:top_n]
    names = [features[idx] for idx in best_first]
    scores = [row[idx] for idx in best_first]
    return pd.DataFrame(list(zip(names, scores)), columns=['features', 'score'])
def top_feats_in_doc(X, features, row_id, top_n=25):
    """Return the top tf-idf features of a single document.

    Args:
        X: Matrix whose rows support ``.toarray()``.
        features: Sequence of feature names aligned with the columns of X.
        row_id: Index of the document row to inspect.
        top_n: Maximum number of features to return.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for that row.
    """
    dense_row = X[row_id].toarray()
    flat_scores = np.squeeze(dense_row)
    return top_tfidf_feats(flat_scores, features, top_n)
def top_mean_feats(X, features,
                   grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the features with the highest mean tf-idf score.

    Args:
        X: Matrix supporting row indexing and ``.toarray()``.
        features: Sequence of feature names aligned with the columns of X.
        grp_ids: Row selector restricting the mean to a subset of
            documents; ``None`` uses every row.
        min_tfidf: Scores below this threshold are zeroed before averaging.
        top_n: Maximum number of features to return.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for the column means.
    """
    # Compare against None explicitly: a truthiness test raises for
    # multi-element NumPy arrays and wrongly treats a selector such as
    # row index 0 as "no selection".
    if grp_ids is None:
        D = X.toarray()
    else:
        D = X[grp_ids].toarray()
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
    """Compute the top mean tf-idf features for every distinct label in y.

    Args:
        X: Matrix passed through to ``top_mean_feats``.
        y: Array of labels, one per row of X.
        features: Sequence of feature names aligned with the columns of X.
        min_tfidf: Threshold forwarded to ``top_mean_feats``.
        top_n: Maximum number of features per label.

    Returns:
        A list with one DataFrame per unique label, in ``np.unique``
        order. Each frame has its ``label`` attribute set to that label.
    """
    def summarise(cluster):
        members = np.where(y == cluster)
        frame = top_mean_feats(X, features, members, min_tfidf=min_tfidf, top_n=top_n)
        frame.label = cluster
        return frame

    return [summarise(cluster) for cluster in np.unique(y)]
# --- Script: cluster the e-mails under MAIL_DIR and visualise the result ---
MAIL_DIR = "tocluster/"

filenames = readfiles(MAIL_DIR)
mails = []
for name in filenames:
    parsed_mail = parse_email(MAIL_DIR + name)
    # parse_email returns -1 for files it could not decode; skip those.
    if parsed_mail != -1:
        mails.append(parsed_mail)
email_df = pd.DataFrame(mails)

vect = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=2)
X = vect.fit_transform(email_df.body)

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement when available and fall back on older releases.
if hasattr(vect, 'get_feature_names_out'):
    features = vect.get_feature_names_out()
else:
    features = vect.get_feature_names()

n_clusters = 8
clf = KMeans(n_clusters=n_clusters, max_iter=1000, init='k-means++', n_init=1, random_state=0)
labels = clf.fit_predict(X)
print(top_mean_feats(X, features, top_n=10))

# Let's plot this with matplotlib to visualize it.
# First we need to make 2D coordinates from the sparse matrix.
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn estimators reject. PCA is fitted once here (it was
# previously fitted twice, with the first result discarded).
X_dense = X.toarray()
pca = PCA(n_components=2).fit(X_dense)
coords = pca.transform(X_dense)

# Color every point by its cluster.
# This list needs to be at least n_clusters long.
label_colors = ["#2AB0E9", "#2BAF74", "#D7665E", "#CCCCCC",
                "#D2CA0D", "#522A64", "#A3DB05", "#FC6514",
                "#C1AE9F", "#D3A588"]
colors = [label_colors[i] for i in labels]
plt.scatter(coords[:, 0], coords[:, 1], c=colors)

# Plot the cluster centers, projected into the same 2D space.
centroids = clf.cluster_centers_
centroid_coords = pca.transform(centroids)
plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker='X', s=200, linewidths=2, c='#444d60')
plt.show()

# Plot the top terms per cluster with matplotlib.
plot_tfidf_classfeats_h(top_feats_per_cluster(X, labels, features, 0.1, 25))
Add Comment
Please, Sign In to add comment