# Untitled

from os import walk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

def plot_tfidf_classfeats_h(dfs):
    """Show one horizontal tf-idf bar chart per entry of ``dfs``, side by side.

    Every element of ``dfs`` is read for its ``features`` and ``score``
    columns and for a ``label`` attribute, which appears in the subplot
    title. Bar positions are derived from the length of ``dfs[0]`` and reused
    for every subplot. The figure is displayed with ``plt.show()`` and
    nothing is returned.
    """
    figure = plt.figure(figsize=(12, 9), facecolor="w")
    # Bar positions come from the first frame only and are shared by all panels.
    positions = np.arange(len(dfs[0]))
    panel_count = len(dfs)
    for panel_no, frame in enumerate(dfs, start=1):
        axis = figure.add_subplot(1, panel_count, panel_no)

        # Strip the box around the plot and keep ticks on the bottom/left only.
        for side in ("top", "right"):
            axis.spines[side].set_visible(False)
        axis.set_frame_on(False)
        axis.get_xaxis().tick_bottom()
        axis.get_yaxis().tick_left()

        axis.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
        axis.set_title("cluster = " + str(frame.label), fontsize=16)
        axis.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))

        axis.barh(positions, frame.score, align='center', color='#7530FF')
        axis.set_yticks(positions)
        axis.set_ylim([-1, positions[-1] + 1])
        axis.set_yticklabels(frame.features)

        plt.subplots_adjust(
            bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52
        )
    plt.show()

def readfiles(mypath):
    """Return the names of the files located directly inside ``mypath``.

    Only the top level of the directory is listed; sub-directories are not
    descended into. An empty list is returned when ``mypath`` does not exist
    or contains no files.
    """
    # walk() yields a (dirpath, dirnames, filenames) triple for the top
    # directory first, and only that triple is needed. The previous loop
    # reused the name ``filenames`` as its loop variable and then extended the
    # list with itself, so every file name was returned twice.
    _, _, top_level_files = next(walk(mypath), (mypath, [], []))
    return list(top_level_files)

def parse_raw_message(raw_message):
    """Parse a raw e-mail string into a dict of selected headers and body.

    The message is split on newlines. A line without a colon is treated as
    body text: it is stripped and concatenated (with no separator) into the
    ``'body'`` entry. A line with a colon is treated as ``key: value``; the
    key is lower-cased and the pair is kept only when the key is ``'from'``
    or ``'to'``. Other colon-containing lines are ignored.

    Returns:
        dict: may contain ``'from'``, ``'to'`` and ``'body'``. ``'body'`` is
        present only when at least one line had no colon.
    """
    keys_to_extract = ('from', 'to')
    email = {}
    body_parts = []
    for line in raw_message.split('\n'):
        # partition() splits on the FIRST colon only, so values that contain
        # further colons (e.g. "sip:bob@example.com") are kept whole instead
        # of being cut off at the second colon.
        key, colon, value = line.partition(':')
        if not colon:
            body_parts.append(line.strip())
            continue
        key = key.lower()
        if key in keys_to_extract:
            email[key] = value.strip()
    if body_parts:
        # Join once at the end instead of growing a string line by line.
        email['body'] = ''.join(body_parts)
    return email

def email_to_dataframe(filename):
    """Load ``filename`` with ``np.loadtxt`` and discard the loaded data.

    There is no return statement, so the caller always receives ``None``.
    """
    # NOTE(review): looks like an unfinished stub - the loaded array is
    # never used and, despite the name, no DataFrame is built.
    np.loadtxt(filename)
    return None

def parse_into_emails(messages):
    """Parse every raw message and regroup the results by field.

    Each message goes through ``parse_raw_message``; the parsed dicts are
    then handed to ``map_to_list`` once per field.

    Returns:
        dict: keys ``'body'``, ``'to'`` and ``'from_'`` holding whatever
        ``map_to_list`` returns for the fields ``'body'``, ``'to'`` and
        ``'from'`` respectively.
    """
    # NOTE(review): map_to_list is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    parsed = list(map(parse_raw_message, messages))
    body_column = map_to_list(parsed, 'body')
    to_column = map_to_list(parsed, 'to')
    from_column = map_to_list(parsed, 'from')
    return {'body': body_column, 'to': to_column, 'from_': from_column}

def parse_email(filename):
    """Read one e-mail file and return its selected headers and body.

    A line without a colon is treated as body text: it is stripped and
    concatenated (with no separator) into ``'body'``. A line with a colon is
    treated as ``key: value``; the key is lower-cased and the pair is kept
    only for the headers From, Subject, Organization and Lines.

    Args:
        filename: path of the file to read (opened with the default text
            encoding).

    Returns:
        dict with the key ``'body'`` (always present, possibly empty) and,
        when found, ``'from'``, ``'subject'``, ``'organization'`` and
        ``'lines'`` (all values are strings); or ``-1`` when the file
        cannot be decoded.
    """
    with open(filename) as f:
        try:
            content = f.readlines()
        except UnicodeDecodeError:
            # Undecodable files are reported and skipped by the caller via
            # the -1 sentinel rather than aborting the whole run.
            print('UTf encoding error')
            return -1

    # Keys are compared after lower-casing, so the wanted names must be lower
    # case too; the capitalised names used previously could never match.
    keys_to_extract = ('from', 'subject', 'organization', 'lines')
    email = {}
    body_parts = []
    for line in content:
        # Split on the first colon only so values such as "Re: hello" are
        # kept whole instead of being truncated at the second colon.
        key, colon, value = line.partition(':')
        if not colon:
            body_parts.append(line.strip())
            continue
        key = key.lower()
        if key in keys_to_extract:
            email[key] = value.strip()
    # Always provide a body, even for files that only contain header lines
    # (this used to raise KeyError).
    email['body'] = ''.join(body_parts)
    return email

def top_tfidf_feats(row, features, top_n=20):
    """Return the ``top_n`` highest-scoring features of ``row`` as a DataFrame.

    ``row`` is sorted with ``np.argsort`` and read in descending order; each
    selected index is paired with ``features[index]``.

    Returns:
        pandas.DataFrame with the columns ``'features'`` and ``'score'``,
        ordered from highest to lowest score.
    """
    descending_ids = np.argsort(row)[::-1]
    ranked = []
    for idx in descending_ids[:top_n]:
        ranked.append((features[idx], row[idx]))
    return pd.DataFrame(ranked, columns=['features', 'score'])
def top_feats_in_doc(X, features, row_id, top_n=25):
    """Return the top ``top_n`` tf-idf features of row ``row_id`` of ``X``.

    The selected row is converted with ``.toarray()``, squeezed with
    ``np.squeeze`` and ranked by ``top_tfidf_feats``, whose result is
    returned unchanged.
    """
    dense_row = X[row_id].toarray()
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n)

def top_mean_feats(X, features,
    grp_ids=None, min_tfidf=0.1, top_n=25):
    """Return the features with the highest mean tf-idf score over some rows.

    Args:
        X: matrix supporting row indexing and ``.toarray()``.
        features: feature names, indexable by column position.
        grp_ids: row selection applied as ``X[grp_ids]``; ``None`` (the
            default) uses every row of ``X``.
        min_tfidf: scores below this value are set to 0 before averaging.
        top_n: number of features to return.

    Returns:
        The result of ``top_tfidf_feats`` applied to the column-wise means.
    """
    # Compare against None explicitly: a truthiness test raises ValueError
    # for a multi-element numpy index array and silently treats the valid
    # selection ``0`` (or an empty selection) as "use the whole matrix".
    if grp_ids is not None:
        D = X[grp_ids].toarray()
    else:
        D = X.toarray()
    # D is a fresh dense copy, so zeroing small scores does not modify X.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
    """Return one top-features DataFrame per distinct value in ``y``.

    For every unique label in ``y`` the rows where ``y`` equals that label
    are passed to ``top_mean_feats``. The label is stored on the resulting
    frame as its ``label`` attribute. Frames are returned in the order
    produced by ``np.unique``.
    """
    per_cluster = []
    for cluster_label in np.unique(y):
        # np.where returns a tuple of index arrays; it is forwarded as-is.
        member_rows = np.where(y == cluster_label)
        frame = top_mean_feats(
            X, features, member_rows, min_tfidf=min_tfidf, top_n=top_n
        )
        frame.label = cluster_label
        per_cluster.append(frame)
    return per_cluster

# Load and parse every mail found directly inside the input folder.
filenames = readfiles("tocluster/")
mails = []
for filename in filenames:
    parsed_mail = parse_email("tocluster/" + filename)
    # parse_email returns -1 for files it could not decode; skip those.
    if parsed_mail != -1:
        mails.append(parsed_mail)

# Turn the mail bodies into a tf-idf matrix.
email_df = pd.DataFrame(mails)
vect = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=2)
X = vect.fit_transform(email_df.body)
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vect, "get_feature_names_out"):
    features = vect.get_feature_names_out()
else:
    features = vect.get_feature_names()

# Cluster the mails.
n_clusters = 8
clf = KMeans(n_clusters=n_clusters, max_iter=1000, init='k-means++', n_init=1, random_state=0)
labels = clf.fit_predict(X)

print(top_mean_feats(X, features, top_n=10))

# Project the tf-idf matrix to 2D so it can be drawn with matplotlib.
# toarray() gives a plain ndarray; todense() returns np.matrix, which recent
# scikit-learn versions refuse as estimator input. PCA is fitted once only
# (an earlier, redundant fit whose result was overwritten has been removed).
X_dense = X.toarray()
pca = PCA(n_components=2).fit(X_dense)
coords = pca.transform(X_dense)

# One colour per cluster label.
# This list needs to be at least n_clusters long.
label_colors = ["#2AB0E9", "#2BAF74", "#D7665E", "#CCCCCC",
                "#D2CA0D", "#522A64", "#A3DB05", "#FC6514",
                "#C1AE9F", "#D3A588"]
colors = [label_colors[label] for label in labels]

plt.scatter(coords[:, 0], coords[:, 1], c=colors)
# Plot the cluster centers, projected with the same PCA as the documents.
centroids = clf.cluster_centers_
centroid_coords = pca.transform(centroids)

plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker='X', s=200, linewidths=2, c='#444d60')
plt.show()

# Plot the top terms per cluster with matplotlib.
plot_tfidf_classfeats_h(top_feats_per_cluster(X, labels, features, 0.1, 25))