Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit *pipeline* on the training split and score it on the test split.

    Prints the model accuracy next to the null accuracy (the accuracy of
    always predicting the majority class) plus the combined fit/predict time.

    Parameters
    ----------
    pipeline : sklearn estimator with fit/predict (e.g. a Pipeline)
    x_train, y_train : training texts and labels
    x_test, y_test : evaluation texts and labels

    Returns
    -------
    (accuracy, train_test_time) : float accuracy score and elapsed seconds.
    """
    # Null accuracy = share of the majority class in the test labels.
    # NOTE(review): assumes binary labels coded as 0/1 — confirm against caller.
    zero_rate = len(x_test[y_test == 0]) / (len(x_test) * 1.)
    # Original computed this with a duplicated if/else; max() is equivalent.
    null_accuracy = max(zero_rate, 1. - zero_rate)
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0
    accuracy = accuracy_score(y_test, y_pred)
    print("null accuracy: {0:.2f}%".format(null_accuracy * 100))
    print("accuracy score: {0:.2f}%".format(accuracy * 100))
    if accuracy > null_accuracy:
        print("model is {0:.2f}% more accurate than null accuracy".format((accuracy - null_accuracy) * 100))
    elif accuracy == null_accuracy:
        print("model has the same accuracy with the null accuracy")
    else:
        print("model is {0:.2f}% less accurate than null accuracy".format((null_accuracy - accuracy) * 100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print("-" * 80)
    return accuracy, train_test_time
# Shared baseline estimators reused by nfeature_accuracy_checker's defaults.
cvec = CountVectorizer()
lr = LogisticRegression()
# Feature-count grid: 10k, 20k, ..., 100k.
n_features = np.arange(10000, 100001, 10000)
def nfeature_accuracy_checker(vectorizer=cvec, n_features=n_features, stop_words=None, ngram_range=(1, 1), classifier=lr):
    """Evaluate *classifier* over a grid of vectorizer max_features values.

    For each n in *n_features*, reconfigures *vectorizer* in place, builds a
    vectorizer+classifier Pipeline, and scores it with accuracy_summary.

    NOTE(review): the defaults ``cvec``/``lr`` are module-level objects shared
    (and mutated via ``set_params``) across calls — confirm this is intended.
    NOTE(review): reads the globals x_train, y_train, x_validation,
    y_validation, which must be defined elsewhere in the file.

    Returns
    -------
    list of (n, validation_accuracy, train_test_time) tuples.
    """
    result = []
    print(classifier)
    print("\n")
    for n in n_features:
        # Mutates the (possibly shared) vectorizer in place for this run.
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])
        print("Validation result for {} features".format(n))
        nfeature_accuracy, tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n, nfeature_accuracy, tt_time))
    return result
from sklearn.feature_extraction.text import TfidfVectorizer

# Repeat the feature-count sweep with a tf-idf vectorizer for unigrams,
# unigrams+bigrams, and unigrams+trigrams.
tvec = TfidfVectorizer()
feature_result_ugt = nfeature_accuracy_checker(vectorizer=tvec)
feature_result_bgt = nfeature_accuracy_checker(vectorizer=tvec, ngram_range=(1, 2))
feature_result_tgt = nfeature_accuracy_checker(vectorizer=tvec, ngram_range=(1, 3))

# Collect each sweep into a DataFrame for plotting.
result_columns = ['nfeatures', 'validation_accuracy', 'train_test_time']
nfeatures_plot_ugt = pd.DataFrame(feature_result_ugt, columns=result_columns)
nfeatures_plot_bgt = pd.DataFrame(feature_result_bgt, columns=result_columns)
nfeatures_plot_tgt = pd.DataFrame(feature_result_tgt, columns=result_columns)
# Plot validation accuracy vs. feature count for each n-gram configuration.
plt.figure(figsize=(8, 6))
plt.plot(nfeatures_plot_tgt.nfeatures, nfeatures_plot_tgt.validation_accuracy, label='trigram tfidf vectorizer', color='royalblue')
plt.plot(nfeatures_plot_bgt.nfeatures, nfeatures_plot_bgt.validation_accuracy, label='bigram tfidf vectorizer', color='orangered')
plt.plot(nfeatures_plot_ugt.nfeatures, nfeatures_plot_ugt.validation_accuracy, label='unigram tfidf vectorizer', color='gold')
# BUG FIX: the original unconditionally plotted nfeatures_plot_tg / _bg / _ug
# (count-vectorizer results), which are never defined in this file and raised
# NameError. Plot them only if they exist (e.g. computed in an earlier
# notebook cell or an unseen part of the file).
for _name, _label, _color in [
    ('nfeatures_plot_tg', 'trigram count vectorizer', 'royalblue'),
    ('nfeatures_plot_bg', 'bigram count vectorizer', 'orangered'),
    ('nfeatures_plot_ug', 'unigram count vectorizer', 'gold'),
]:
    _df = globals().get(_name)
    if _df is not None:
        plt.plot(_df.nfeatures, _df.validation_accuracy, label=_label, linestyle=':', color=_color)
plt.title("N-gram(1~3) test result : Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()
Add Comment
Please sign in to add a comment.