Untitled
a guest | Nov 9th, 2015 | Python | 9.53 KB

from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
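
# NOTE: this script targets the pre-0.18 scikit-learn API that was current
# when it was written: train_test_split lives in sklearn.cross_validation
# (sklearn.model_selection from 0.18 on), Perceptron/SGDClassifier take
# n_iter (later renamed max_iter), and HashingVectorizer still accepts
# non_negative=True.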


# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# parse commandline arguments
op = OptionParser()
op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
op.add_option("--n_features",
              action="store", type="int", default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()

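# Example invocation (script name assumed; flags are defined above):
#   python benchmark_ted_topics.py --report --confusion_matrix --chi2_select=1000
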

###############################################################################
# Load some categories from the training set

print("Loading TED dataset")

# The upstream scikit-learn example fetches 20 newsgroups here instead:
# data_train = fetch_20newsgroups(subset='train', categories=categories,
#                                 shuffle=True, random_state=42,
#                                 remove=remove)
# data_test = fetch_20newsgroups(subset='test', categories=categories,
#                                shuffle=True, random_state=42,
#                                remove=remove)

# encoding='utf-8' makes load_files return unicode strings, which size_mb()
# and the vectorizers below expect; decode_error='replace' tolerates the
# odd badly encoded byte.
dataset = load_files('./TED_dataset/Topics/', encoding='utf-8',
                     decode_error='replace')
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.3, random_state=0)
print('data loaded')

categories = dataset.target_names    # for case categories == None

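# NOTE: load_files assumes one sub-directory per class under
# ./TED_dataset/Topics/ (assumed layout: Topics/<category>/<doc>.txt);
# each file becomes one sample and its directory name becomes the label,
# exposed as dataset.target and dataset.target_names.
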
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

data_train_size_mb = size_mb(docs_train)
data_test_size_mb = size_mb(docs_test)

print("%d documents - %0.3fMB (training set)" % (
    len(docs_train), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(docs_test), data_test_size_mb))
print("%d categories" % len(categories))
print()

# y_train and y_test already come from the train_test_split call above

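# Turn the raw documents into a sparse term matrix: either stateless feature
# hashing (--use_hashing) or a TF-IDF vocabulary fitted on the training set.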
print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    # stateless hashing: fast and memory-cheap, but one-way, so no
    # feature_names mapping is available later
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(docs_train)
else:
    # TF-IDF with sublinear term-frequency scaling; max_df=0.5 drops terms
    # that appear in more than half of the training documents
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(docs_train)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(docs_test)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

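# Optionally keep only the K most class-predictive features, selected by a
# univariate chi-squared test (enabled with --chi2_select=K).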
if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."


###############################################################################
# Benchmark classifiers
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                      % (category, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

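# benchmark() returns (classifier_name, accuracy, train_time, test_time);
# one tuple is collected per model and fed to the bar chart at the end.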

results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model (squared hinge loss, a.k.a. L2 loss)
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
    ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
    ('classification', LinearSVC())
])))
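
# NOTE: using LinearSVC as the first pipeline step relies on its old
# transform() method for feature selection; recent scikit-learn versions
# wrap the estimator in sklearn.feature_selection.SelectFromModel instead.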

# make some plots

indices = np.arange(len(results))

# transpose: list of (name, score, train_time, test_time) tuples into four
# parallel lists
results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
# normalize times to [0, 1] so they fit on the same axis as accuracy
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.show()