Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report as evaluate, accuracy_score
from create_sentiment_featuresets import create_feature_sets_and_labels
# NOTE: the original `from itertools import izip` was removed — izip is
# Python-2-only (it is plain `zip` in Python 3), was never used anywhere in
# this file, and made the script crash at import time on Python 3 even though
# the rest of the code (print() calls, true division) targets Python 3.
# File Sources
pos = './data/pos.txt'  # positive-sentiment examples, one per line
neg = './data/neg.txt'  # negative-sentiment examples, one per line
def train_model(model, train_x, train_y):
    """Fit *model* on the training examples and return the fitted model.

    The model is fitted in place; it is also returned so calls can be
    chained (``model = train_model(model, ...)``).
    """
    model.fit(train_x, train_y)
    return model
def evaluate_model(model, test_x, test_y, target_names=('Positive', 'Negative')):
    """Evaluate a fitted classifier on the test split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    test_x, test_y : test features and ground-truth labels.
    target_names : display names for the classes in the report. The default
        matches the labels this script uses; previously the function read a
        module-level ``target_names`` global that only existed when the file
        ran as ``__main__``, so it broke when imported — taking it as a
        defaulted parameter preserves behavior while fixing that.

    Returns
    -------
    tuple
        ``(accuracy, classification_report)`` where ``accuracy`` is the
        COUNT of correctly classified examples (``normalize=False``), not a
        fraction, and ``classification_report`` is a dict
        (``output_dict=True``) suitable for ``pd.DataFrame``.
    """
    predict_y = model.predict(test_x)
    classification_report = evaluate(test_y, predict_y,
                                     target_names=target_names,
                                     output_dict=True)
    # normalize=False -> number of correct predictions; the percentage is
    # derived later in export_results().
    accuracy = accuracy_score(test_y, predict_y, normalize=False)
    return accuracy, classification_report
def autolabel(rects, ax):
    """
    Attach a text label above each bar displaying its height
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Center the label horizontally on the bar, slightly above its top.
        label_x = bar.get_x() + bar.get_width() / 2.
        ax.text(label_x, 1.05 * bar_height, '%d' % int(bar_height),
                ha='center', va='bottom')
def export_results(kernels, accuracies, reports, n_examples, plt):
    """Write one TSV classification report per kernel and draw two labelled
    bar charts: correct-prediction counts and accuracy percentages.

    ``accuracies`` holds counts of correctly predicted examples (see
    ``evaluate_model``); ``plt`` is the ``matplotlib.pyplot`` module.
    Each chart is shown interactively and then saved as a PNG.
    """

    def _bar_chart(values, y_max, y_label, title, out_file):
        # Shared plumbing for both charts: bars per kernel, value labels,
        # interactive display, then save to disk.
        positions = np.arange(len(kernels))
        fig, ax = plt.subplots()
        bars = ax.bar(positions, values, color='b', align='center', alpha=0.5)
        plt.xticks(positions, [k.upper() for k in kernels])
        ax.set_ylim([0, y_max])
        plt.xlabel('Kernels')
        plt.ylabel(y_label)
        plt.title(title)
        autolabel(bars, ax)
        plt.show()
        fig.savefig(out_file)

    # Classification Reports — one tab-separated file per kernel.
    for kernel_name, report in zip(kernels, reports):
        pd.DataFrame(report).T.to_csv(kernel_name + '_report.csv', sep='\t')

    # Correct Predictions Graph
    print ("Correct Predictions Graph...")
    _bar_chart(accuracies, n_examples, 'Examples correctly predicted',
               'Correct Predictions (Total Examples: {})'.format(n_examples),
               'correct_predictions.png')

    # Accuracy Graph
    print ("Plotting Accuracy Graph...")
    percentages = [(count / n_examples) * 100 for count in accuracies]
    _bar_chart(percentages, 100, 'Accuracy in %', 'Accuracy',
               'accuracy_percentage.png')
if __name__ == "__main__":
    print ("\n" + "="*30 + " Start " + "="*30 + "\n")
    print ("Preprocessing Data...")
    # Hold out 30% of the examples for testing; train on the remaining 70%.
    train_x, train_y, test_x, test_y = create_feature_sets_and_labels(pos, neg, test_size=0.3)
    # Dataset stats (per the original author):
    #   Total number of examples = 10632
    #   Number of features = 423
    #   Total number of classes = 2
    #   (0 and 1; denoting 'Positive' or 'Negative' Sentiment)
    #   Training Split = 7464, Testing Split = 3198
    target_names = ['Positive', 'Negative']
    print ("Training Examples: {} | Training Labels: {}".format(len(train_x), len(train_y)))
    print ("Testing Examples : {} | Testing Labels : {}".format(len(test_x), len(test_y)))
    print ("Applying Support Vector Machines...")
    kernels = ('linear', 'poly', 'rbf', 'sigmoid')  # List of kernels
    accuracies = []  # correct-prediction counts, one per kernel
    reports = []     # classification-report dicts, one per kernel
    for svm_kernel in kernels:
        print ("\nUsing Kernel: {}".format(svm_kernel))
        print ("\tStep 1/3: Creating Model")
        # All kernels share C=10; gamma applies to every kernel except
        # 'linear', and degree is meaningful only for 'poly'.
        svc_kwargs = {'C': 10, 'kernel': svm_kernel}
        if svm_kernel != 'linear':
            svc_kwargs['gamma'] = 'scale'
        if svm_kernel == 'poly':
            svc_kwargs['degree'] = 2
        model = SVC(**svc_kwargs)
        print ("\tStep 2/3: Training Model")
        model = train_model(model, train_x, train_y)
        print ("\tStep 3/3: Evaluating Model")
        accuracy, report = evaluate_model(model, test_x, test_y)
        accuracies.append(accuracy)
        reports.append(report)
    export_results(kernels, accuracies, reports, len(test_x), plt)
    print ("\n" + "="*30 + " End " + "="*30 + "\n")
Add Comment
Please sign in to add a comment.