# Untitled

from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.cross_validation import train_test_split


# Configure logging so that records at INFO level and above are emitted,
# each prefixed with a timestamp and its level name.
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.INFO)


# Command-line options, declared as (flag, add_option keyword arguments)
# pairs. They are registered on the parser in exactly this order.
_OPTION_SPECS = (
    ("--report", dict(
        dest="print_report",
        action="store_true",
        help="Print a detailed classification report.")),
    ("--chi2_select", dict(
        dest="select_chi2",
        action="store",
        type="int",
        help="Select some number of features using a chi-squared test")),
    ("--confusion_matrix", dict(
        dest="print_cm",
        action="store_true",
        help="Print the confusion matrix.")),
    ("--top10", dict(
        dest="print_top10",
        action="store_true",
        help="Print ten most discriminative terms per class"
             " for every classifier.")),
    ("--all_categories", dict(
        dest="all_categories",
        action="store_true",
        help="Whether to use all categories or not.")),
    # The remaining options give no explicit dest; optparse derives it from
    # the long flag name (use_hashing, n_features, filtered).
    ("--use_hashing", dict(
        action="store_true",
        help="Use a hashing vectorizer.")),
    ("--n_features", dict(
        action="store",
        type=int,
        default=2 ** 16,
        help="n_features when using the hashing vectorizer.")),
    ("--filtered", dict(
        action="store_true",
        help="Remove newsgroup information that is easily overfit: "
             "headers, signatures, and quoting.")),
)

# Build the parser and register every option from the table above.
op = OptionParser()
for _flag, _settings in _OPTION_SPECS:
    op.add_option(_flag, **_settings)

# Drop the scaffolding names so the module namespace only gains ``op``.
del _OPTION_SPECS, _flag, _settings

###############################################################################
# Load the documents under ./TED_dataset/Topics/ and split them 80/20 into
# training and test sets

print("Loading dataset ")

# Earlier 20 newsgroups loading, kept for reference:
# data_train = fetch_20newsgroups(subset='train', categories=categories,
#                                 shuffle=True, random_state=42,
#                                 remove=remove)

# data_test = fetch_20newsgroups(subset='test', categories=categories,
#                                shuffle=True, random_state=42,
#                                remove=remove)

# load_files returns a dict-like Bunch. Per the scikit-learn documentation the
# raw documents are in ``.data`` and the integer class labels in ``.target``.
dataset = load_files('./TED_dataset/Topics/')

# Split the documents and their labels together so they stay aligned. The
# Bunch itself is a container of fields, not a sequence of samples, so it must
# not be passed to train_test_split directly.
# ``train``/``test`` hold the documents; ``y_train``/``y_test`` hold the
# matching labels.
train, test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, train_size=0.8)
# categories = data_train.target_names    # for case categories == None


# def size_mb(docs):
#     return sum(len(s.encode('utf-8')) for s in docs) / 1e6

# data_train_size_mb = size_mb(data_train.data)
# data_test_size_mb = size_mb(data_test.data)

# print("%d documents - %0.3fMB (training set)" % (
#     len(data_train.data), data_train_size_mb))
# print("%d documents - %0.3fMB (test set)" % (
#     len(data_test.data), data_test_size_mb))
# print("%d categories" % len(categories))
# print()

# # split a training set and a test set
# y_train, y_test = data_train.target, data_test.target

# print("Extracting features from the training data using a sparse vectorizer")
# t0 = time()
# if opts.use_hashing:
#     vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
#                                    n_features=opts.n_features)
#     X_train = vectorizer.transform(data_train.data)
# else:
#     vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
#                                  stop_words='english')
#     X_train = vectorizer.fit_transform(data_train.data)
# duration = time() - t0
# print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
# print("n_samples: %d, n_features: %d" % X_train.shape)
# print()

# print("Extracting features from the test data using the same vectorizer")
# t0 = time()
# X_test = vectorizer.transform(data_test.data)
# duration = time() - t0
# print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
# print("n_samples: %d, n_features: %d" % X_test.shape)
# print()

# # mapping from integer feature name to original token string
# if opts.use_hashing:
#     feature_names = None
# else:
#     feature_names = vectorizer.get_feature_names()

# if opts.select_chi2:
#     print("Extracting %d best features by a chi-squared test" %
#           opts.select_chi2)
#     t0 = time()
#     ch2 = SelectKBest(chi2, k=opts.select_chi2)
#     X_train = ch2.fit_transform(X_train, y_train)
#     X_test = ch2.transform(X_test)
#     if feature_names:
#         # keep selected feature names
#         feature_names = [feature_names[i] for i
#                          in ch2.get_support(indices=True)]
#     print("done in %fs" % (time() - t0))
#     print()

# if feature_names:
#     feature_names = np.asarray(feature_names)


# def trim(s):
#     """Trim string to fit on terminal (assuming 80-column display)"""
#     return s if len(s) <= 80 else s[:77] + "..."


# ###############################################################################
# # Benchmark classifiers
# def benchmark(clf):
#     print('_' * 80)
#     print("Training: ")
#     print(clf)
#     t0 = time()
#     clf.fit(X_train, y_train)
#     train_time = time() - t0
#     print("train time: %0.3fs" % train_time)

#     t0 = time()
#     pred = clf.predict(X_test)
#     test_time = time() - t0
#     print("test time:  %0.3fs" % test_time)

#     score = metrics.accuracy_score(y_test, pred)
#     print("accuracy:   %0.3f" % score)

#     if hasattr(clf, 'coef_'):
#         print("dimensionality: %d" % clf.coef_.shape[1])
#         print("density: %f" % density(clf.coef_))

#         if opts.print_top10 and feature_names is not None:
#             print("top 10 keywords per class:")
#             for i, category in enumerate(categories):
#                 top10 = np.argsort(clf.coef_[i])[-10:]
#                 print(trim("%s: %s"
#                       % (category, " ".join(feature_names[top10]))))
#         print()

#     if opts.print_report:
#         print("classification report:")
#         print(metrics.classification_report(y_test, pred,
#                                             target_names=categories))

#     if opts.print_cm:
#         print("confusion matrix:")
#         print(metrics.confusion_matrix(y_test, pred))

#     print()
#     clf_descr = str(clf).split('(')[0]
#     return clf_descr, score, train_time, test_time


# results = []
# for clf, name in (
#         (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
#         (Perceptron(n_iter=50), "Perceptron"),
#         (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
#         (KNeighborsClassifier(n_neighbors=10), "kNN"),
#         (RandomForestClassifier(n_estimators=100), "Random forest")):
#     print('=' * 80)
#     print(name)
#     results.append(benchmark(clf))

# for penalty in ["l2", "l1"]:
#     print('=' * 80)
#     print("%s penalty" % penalty.upper())
#     # Train Liblinear model
#     results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
#                                             dual=False, tol=1e-3)))

#     # Train SGD model
#     results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
#                                            penalty=penalty)))

# # Train SGD with Elastic Net penalty
# print('=' * 80)
# print("Elastic-Net penalty")
# results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
#                                        penalty="elasticnet")))

# # Train NearestCentroid without threshold
# print('=' * 80)
# print("NearestCentroid (aka Rocchio classifier)")
# results.append(benchmark(NearestCentroid()))

# # Train sparse Naive Bayes classifiers
# print('=' * 80)
# print("Naive Bayes")
# results.append(benchmark(MultinomialNB(alpha=.01)))
# results.append(benchmark(BernoulliNB(alpha=.01)))

# print('=' * 80)
# print("LinearSVC with L1-based feature selection")
# # The smaller C, the stronger the regularization.
# # The more regularization, the more sparsity.
# results.append(benchmark(Pipeline([
#   ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
#   ('classification', LinearSVC())
# ])))

# # make some plots

# indices = np.arange(len(results))

# results = [[x[i] for x in results] for i in range(4)]

# clf_names, score, training_time, test_time = results
# training_time = np.array(training_time) / np.max(training_time)
# test_time = np.array(test_time) / np.max(test_time)

# plt.figure(figsize=(12, 8))
# plt.title("Score")
# plt.barh(indices, score, .2, label="score", color='r')
# plt.barh(indices + .3, training_time, .2, label="training time", color='g')
# plt.barh(indices + .6, test_time, .2, label="test time", color='b')
# plt.yticks(())
# plt.legend(loc='best')
# plt.subplots_adjust(left=.25)
# plt.subplots_adjust(top=.95)
# plt.subplots_adjust(bottom=.05)

# for i, c in zip(indices, clf_names):
#     plt.text(-.3, i, c)

# plt.show()