Guest User

Untitled

a guest
Jan 23rd, 2018
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.55 KB | None | 0 0
  1. from sklearn.feature_extraction.text import CountVectorizer
  2. from sklearn.linear_model import LogisticRegression
  3. from sklearn.pipeline import Pipeline
  4. from time import time
  5.  
  6. def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
  7. if len(x_test[y_test == 0]) / (len(x_test)*1.) > 0.5:
  8. null_accuracy = len(x_test[y_test == 0]) / (len(x_test)*1.)
  9. else:
  10. null_accuracy = 1. - (len(x_test[y_test == 0]) / (len(x_test)*1.))
  11. t0 = time()
  12. sentiment_fit = pipeline.fit(x_train, y_train)
  13. y_pred = sentiment_fit.predict(x_test)
  14. train_test_time = time() - t0
  15. accuracy = accuracy_score(y_test, y_pred)
  16. print "null accuracy: {0:.2f}%".format(null_accuracy*100)
  17. print "accuracy score: {0:.2f}%".format(accuracy*100)
  18. if accuracy > null_accuracy:
  19. print "model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100)
  20. elif accuracy == null_accuracy:
  21. print "model has the same accuracy with the null accuracy"
  22. else:
  23. print "model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100)
  24. print "train and test time: {0:.2f}s".format(train_test_time)
  25. print "-"*80
  26. return accuracy, train_test_time
  27.  
  28. cvec = CountVectorizer()
  29. lr = LogisticRegression()
  30. n_features = np.arange(10000,100001,10000)
  31.  
  32. def nfeature_accuracy_checker(vectorizer=cvec, n_features=n_features, stop_words=None, ngram_range=(1, 1), classifier=lr):
  33. result = []
  34. print (classifier)
  35. print "\n"
  36. for n in n_features:
  37. vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
  38. checker_pipeline = Pipeline([
  39. ('vectorizer', vectorizer),
  40. ('classifier', classifier)
  41. ])
  42. print "Validation result for {} features".format(n)
  43. nfeature_accuracy,tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
  44. result.append((n,nfeature_accuracy,tt_time))
  45. return result
  46.  
  47. from sklearn.feature_extraction.text import TfidfVectorizer
  48. tvec = TfidfVectorizer()
  49.  
  50. feature_result_ugt = nfeature_accuracy_checker(vectorizer=tvec)
  51. feature_result_bgt = nfeature_accuracy_checker(vectorizer=tvec,ngram_range=(1, 2))
  52. feature_result_tgt = nfeature_accuracy_checker(vectorizer=tvec,ngram_range=(1, 3))
  53.  
  54.  
  55. nfeatures_plot_tgt = pd.DataFrame(feature_result_tgt,columns=['nfeatures','validation_accuracy','train_test_time'])
  56. nfeatures_plot_bgt = pd.DataFrame(feature_result_bgt,columns=['nfeatures','validation_accuracy','train_test_time'])
  57. nfeatures_plot_ugt = pd.DataFrame(feature_result_ugt,columns=['nfeatures','validation_accuracy','train_test_time'])
  58. plt.figure(figsize=(8,6))
  59. plt.plot(nfeatures_plot_tgt.nfeatures, nfeatures_plot_tgt.validation_accuracy,label='trigram tfidf vectorizer',color='royalblue')
  60. plt.plot(nfeatures_plot_tg.nfeatures, nfeatures_plot_tg.validation_accuracy,label='trigram count vectorizer',linestyle=':', color='royalblue')
  61. plt.plot(nfeatures_plot_bgt.nfeatures, nfeatures_plot_bgt.validation_accuracy,label='bigram tfidf vectorizer',color='orangered')
  62. plt.plot(nfeatures_plot_bg.nfeatures, nfeatures_plot_bg.validation_accuracy,label='bigram count vectorizer',linestyle=':',color='orangered')
  63. plt.plot(nfeatures_plot_ugt.nfeatures, nfeatures_plot_ugt.validation_accuracy, label='unigram tfidf vectorizer',color='gold')
  64. plt.plot(nfeatures_plot_ug.nfeatures, nfeatures_plot_ug.validation_accuracy, label='unigram count vectorizer',linestyle=':',color='gold')
  65. plt.title("N-gram(1~3) test result : Accuracy")
  66. plt.xlabel("Number of features")
  67. plt.ylabel("Validation set accuracy")
  68. plt.legend()
Add Comment
Please, Sign In to add comment