Guest User

Untitled

a guest
Nov 17th, 2018
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.70 KB | None | 0 0
  1. from os import walk
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn.feature_extraction.text import TfidfVectorizer
  5. import matplotlib.pyplot as plt
  6. from sklearn.decomposition import PCA
  7. from sklearn.cluster import KMeans
  8.  
  9. def plot_tfidf_classfeats_h(dfs):
  10. fig = plt.figure(figsize=(12, 9), facecolor="w")
  11. x = np.arange(len(dfs[0]))
  12. for i, df in enumerate(dfs):
  13. ax = fig.add_subplot(1, len(dfs), i+1)
  14. ax.spines["top"].set_visible(False)
  15. ax.spines["right"].set_visible(False)
  16. ax.set_frame_on(False)
  17. ax.get_xaxis().tick_bottom()
  18. ax.get_yaxis().tick_left()
  19. ax.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
  20. ax.set_title("cluster = " + str(df.label), fontsize=16)
  21. ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
  22. ax.barh(x, df.score, align='center', color='#7530FF')
  23. ax.set_yticks(x)
  24. ax.set_ylim([-1, x[-1]+1])
  25. yticks = ax.set_yticklabels(df.features)
  26. plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
  27. plt.show()
  28.  
  29. def readfiles(mypath):
  30. filenames = []
  31. for (dirpath, dirnames, filenames) in walk(mypath):
  32. filenames.extend(filenames)
  33. break
  34. return filenames
  35.  
  36. def parse_raw_message(raw_message):
  37. lines = raw_message.split('\n')
  38. email = {}
  39. message = ''
  40. keys_to_extract = ['from', 'to']
  41. for line in lines:
  42. if ':' not in line:
  43. message += line.strip()
  44. email['body'] = message
  45. else:
  46. pairs = line.split(':')
  47. key = pairs[0].lower()
  48. val = pairs[1].strip()
  49. if key in keys_to_extract:
  50. email[key] = val
  51. return email
  52.  
  53. def email_to_dataframe(filename):
  54. full_mail_contents = np.loadtxt(filename)
  55.  
  56. def parse_into_emails(messages):
  57. emails = [parse_raw_message(message) for message in messages]
  58. return {
  59. 'body': map_to_list(emails, 'body'),
  60. 'to': map_to_list(emails, 'to'),
  61. 'from_': map_to_list(emails, 'from')
  62. }
  63.  
  64. def parse_email(filename):
  65.  
  66. line_count = 0
  67. email = {}
  68. frm = ''
  69. sbj = ''
  70. org = ''
  71. message = []
  72. lines = 0
  73. message_line_number = -1
  74. flag =1
  75. with open(filename) as f:
  76. try:
  77. content = f.readlines()
  78. except:
  79. print('UTf encoding error')
  80. flag=-1
  81.  
  82. if flag!=-1:
  83. keys_to_extract = ['From', 'Subject', 'Organization', 'Lines']
  84. for line in content:
  85. if ':' not in line:
  86. message += line.strip()
  87. email['body'] = message
  88. else:
  89. pairs = line.split(':')
  90. key = pairs[0].lower()
  91. val = pairs[1].strip()
  92. if key in keys_to_extract:
  93. email[key] = val
  94. email['body'] = ''.join(email['body'])
  95. #email['from_'] = content[0].split(' ')[1]
  96. #email['sub'] = content[1][9:]
  97. #email['org'] = content[2][14:]
  98. #email['lines'] = int(content[3].split(' ')[1])
  99. #email['body'] = list(filter(('\n').__ne__, content[4:email['lines'] + 4]))
  100. #email['body'] = " ".join(email['body'])
  101. return email
  102. return -1
  103.  
  104. def top_tfidf_feats(row, features, top_n=20):
  105. topn_ids = np.argsort(row)[::-1][:top_n]
  106. top_feats = [(features[i], row[i]) for i in topn_ids]
  107. df = pd.DataFrame(top_feats, columns=['features', 'score'])
  108. return df
  109. def top_feats_in_doc(X, features, row_id, top_n=25):
  110. row = np.squeeze(X[row_id].toarray())
  111. return top_tfidf_feats(row, features, top_n)
  112.  
  113. def top_mean_feats(X, features,
  114. grp_ids=None, min_tfidf=0.1, top_n=25):
  115. if grp_ids:
  116. D = X[grp_ids].toarray()
  117. else:
  118. D = X.toarray()
  119. D[D < min_tfidf] = 0
  120. tfidf_means = np.mean(D, axis=0)
  121. return top_tfidf_feats(tfidf_means, features, top_n)
  122.  
  123. def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
  124. dfs = []
  125. labels = np.unique(y)
  126. for label in labels:
  127. ids = np.where(y==label)
  128. feats_df = top_mean_feats(X, features, ids, min_tfidf=min_tfidf, top_n=top_n)
  129. feats_df.label = label
  130. dfs.append(feats_df)
  131. return dfs
  132.  
  133. filenames = readfiles("tocluster/")
  134. mails = []
  135. for i in range(len(filenames)):
  136. parsed_mail = parse_email("tocluster/" + filenames[i])
  137. if parsed_mail != -1:
  138. mails.append(parsed_mail)
  139.  
  140. email_df = pd.DataFrame(mails)
  141. vect = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=2)
  142. X = vect.fit_transform(email_df.body)
  143. X_dense = X.todense()
  144. coords = PCA(n_components=2).fit_transform(X_dense)
  145. features = vect.get_feature_names()
  146.  
  147. n_clusters = 8
  148. clf = KMeans(n_clusters=n_clusters, max_iter=1000, init='k-means++', n_init=1, random_state=0)
  149. labels = clf.fit_predict(X)
  150.  
  151. print(top_mean_feats(X, features, top_n=10))
  152. # Let's plot this with matplotlib to visualize it.
  153. # First we need to make 2D coordinates from the sparse matrix.
  154. X_dense = X.todense()
  155. pca = PCA(n_components=2).fit(X_dense)
  156. coords = pca.transform(X_dense)
  157.  
  158. #plt.scatter(coords[:, 0], coords[:, 1], c='m')
  159. # Lets plot it again, but this time we add some color to it.
  160. # This array needs to be at least the length of the n_clusters.
  161. label_colors = ["#2AB0E9", "#2BAF74", "#D7665E", "#CCCCCC",
  162. "#D2CA0D", "#522A64", "#A3DB05", "#FC6514",
  163. "#C1AE9F", "#D3A588"]
  164. colors = [label_colors[i] for i in labels]
  165.  
  166. plt.scatter(coords[:, 0], coords[:, 1], c=colors)
  167. # Plot the cluster centers
  168. centroids = clf.cluster_centers_
  169. centroid_coords = pca.transform(centroids)
  170.  
  171. plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker='X', s=200, linewidths=2, c='#444d60')
  172. plt.show()
  173.  
  174. #Use this to print the top terms per cluster with matplotlib.
  175. plot_tfidf_classfeats_h(top_feats_per_cluster(X, labels, features, 0.1, 25))
Add Comment
Please, Sign In to add comment