Advertisement
Not a member of Pastebin yet?
Sign Up —
it unlocks many cool features!
- import csv
- import pandas
- import sklearn
- import numpy as np
- from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
- from sklearn.naive_bayes import MultinomialNB, BernoulliNB
- from sklearn.svm import LinearSVC
- from sklearn.metrics import confusion_matrix,accuracy_score
- from sklearn.pipeline import make_pipeline
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.model_selection import train_test_split
- from sklearn.decomposition import LatentDirichletAllocation
def tfidf(x_train, x_test, y_train, y_test):
    """Train and evaluate four classifiers on tf-idf features.

    Fits a CountVectorizer (English stop words removed) and a
    TfidfTransformer on the training texts only, then applies the
    *same fitted* transformers to the test texts. (The original code
    fitted a second TfidfTransformer on the test matrix, so test IDF
    weights came from test-set statistics — a train/test inconsistency
    and mild data leakage.)

    Prints a table: one row per model (SVM, MNB, BNB, DTC) with
    accuracy, ham recall, and spam recall on the test set.

    Labels are encoded as spam -> 1, anything else -> 0.
    """
    # Fit vectorizer + tf-idf on the training corpus only.
    cv = CountVectorizer(stop_words='english')
    freq_term_matrix = cv.fit_transform(x_train)
    tf = TfidfTransformer()
    tf_idf_matrix = tf.fit_transform(freq_term_matrix)
    train_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_train])

    # Reuse the train-fitted transformers on the test corpus so feature
    # columns and IDF weights line up with the training representation.
    tf_idf_matrix_test = tf.transform(cv.transform(x_test))
    test_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_test])

    models = [
        ('SVM', LinearSVC()),
        ('MNB', MultinomialNB()),
        ('BNB', BernoulliNB()),
        ('DTC', DecisionTreeClassifier()),
    ]
    # Header row: feature-description / model name, then metric columns.
    print('FD/M', 'acc', 'ham', 'spam')
    for name, model in models:
        # Sparse matrices are accepted directly; no .todense() needed
        # (np.matrix is deprecated and densifying wastes memory).
        model.fit(tf_idf_matrix, train_label)
        result = model.predict(tf_idf_matrix_test)
        mat = confusion_matrix(test_label, result)
        # Row 0 of the confusion matrix is ham, row 1 is spam;
        # diagonal / row-sum gives per-class recall.
        ham_recall = mat[0][0] / (mat[0][0] + mat[0][1])
        spam_recall = mat[1][1] / (mat[1][0] + mat[1][1])
        print(name,
              np.around(accuracy_score(test_label, result), decimals=2),
              np.around(ham_recall, decimals=2),
              np.around(spam_recall, decimals=2))
def withStop(x_train, x_test, y_train, y_test):
    """Train and evaluate four classifiers on raw term counts,
    with English stop words removed.

    Fits a CountVectorizer on the training texts and applies the same
    fitted vectorizer to the test texts. (The original code also fitted
    a second, never-used CountVectorizer on the test set — pure wasted
    work, removed here.)

    Prints a table: one row per model (SVM, MNB, BNB, DTC) with
    accuracy, ham recall, and spam recall on the test set.

    Labels are encoded as spam -> 1, anything else -> 0.
    """
    cv = CountVectorizer(stop_words='english')
    freq_term_matrix = cv.fit_transform(x_train)
    train_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_train])

    # Same fitted vocabulary for the test set so columns line up.
    freq_term_matrix_test = cv.transform(x_test)
    test_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_test])

    models = [
        ('SVM', LinearSVC()),
        ('MNB', MultinomialNB()),
        ('BNB', BernoulliNB()),
        ('DTC', DecisionTreeClassifier()),
    ]
    print('FD/M', 'acc', 'ham', 'spam')
    for name, model in models:
        # Estimators accept sparse input; no .todense() needed.
        model.fit(freq_term_matrix, train_label)
        result = model.predict(freq_term_matrix_test)
        mat = confusion_matrix(test_label, result)
        # Row 0 = ham, row 1 = spam; diagonal / row-sum = recall.
        ham_recall = mat[0][0] / (mat[0][0] + mat[0][1])
        spam_recall = mat[1][1] / (mat[1][0] + mat[1][1])
        print(name,
              np.around(accuracy_score(test_label, result), decimals=2),
              np.around(ham_recall, decimals=2),
              np.around(spam_recall, decimals=2))
def withoutStop(x_train, x_test, y_train, y_test):
    """Train and evaluate four classifiers on raw term counts,
    keeping stop words (no stop-word filtering).

    Fits a CountVectorizer on the training texts and applies the same
    fitted vectorizer to the test texts. (The original code also fitted
    a second, never-used CountVectorizer on the test set — pure wasted
    work, removed here.)

    Prints a table: one row per model (SVM, MNB, BNB, DTC) with
    accuracy, ham recall, and spam recall on the test set.

    Labels are encoded as spam -> 1, anything else -> 0.
    """
    cv = CountVectorizer()
    freq_term = cv.fit_transform(x_train)
    train_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_train])

    # Same fitted vocabulary for the test set so columns line up.
    freq_term_test = cv.transform(x_test)
    test_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_test])

    models = [
        ('SVM', LinearSVC()),
        ('MNB', MultinomialNB()),
        ('BNB', BernoulliNB()),
        ('DTC', DecisionTreeClassifier()),
    ]
    print('FD/M', 'acc', 'ham', 'spam')
    for name, model in models:
        # Estimators accept sparse input; no .todense() needed.
        model.fit(freq_term, train_label)
        result = model.predict(freq_term_test)
        mat = confusion_matrix(test_label, result)
        # Row 0 = ham, row 1 = spam; diagonal / row-sum = recall.
        ham_recall = mat[0][0] / (mat[0][0] + mat[0][1])
        spam_recall = mat[1][1] / (mat[1][0] + mat[1][1])
        print(name,
              np.around(accuracy_score(test_label, result), decimals=2),
              np.around(ham_recall, decimals=2),
              np.around(spam_recall, decimals=2))
def LDA(x_train, x_test, y_train, y_test):
    """Train and evaluate four classifiers on LDA topic features.

    Pipeline: CountVectorizer (English stop words removed) ->
    LatentDirichletAllocation, both fitted on the training texts only
    and then applied, already fitted, to the test texts. (The original
    code additionally fitted a second vectorizer and a second LDA on the
    test set and never used them — pure wasted work, removed here. It
    also called fit_transform followed by a redundant transform on the
    same training matrix.)

    Prints a table: one row per model (SVM, MNB, BNB, DTC) with
    accuracy, ham recall, and spam recall on the test set.

    Labels are encoded as spam -> 1, anything else -> 0.
    """
    dv = CountVectorizer(stop_words='english')
    freq_term = dv.fit_transform(x_train)
    lda = LatentDirichletAllocation()
    # fit_transform already returns the topic distribution for the
    # training docs; no second transform pass needed.
    topic_matrix = lda.fit_transform(freq_term)
    train_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_train])

    # Reuse the train-fitted vectorizer and LDA on the test corpus.
    topic_matrix_test = lda.transform(dv.transform(x_test))
    test_label = np.array([1.0 if y == 'spam' else 0.0 for y in y_test])

    models = [
        ('SVM', LinearSVC()),
        ('MNB', MultinomialNB()),
        ('BNB', BernoulliNB()),
        ('DTC', DecisionTreeClassifier()),
    ]
    print('FD/M', 'acc', 'ham', 'spam')
    for name, model in models:
        # LDA output is already a dense ndarray; np.asmatrix is
        # unnecessary (and np.matrix is deprecated).
        model.fit(topic_matrix, train_label)
        result = model.predict(topic_matrix_test)
        mat = confusion_matrix(test_label, result)
        # Row 0 = ham, row 1 = spam; diagonal / row-sum = recall.
        ham_recall = mat[0][0] / (mat[0][0] + mat[0][1])
        spam_recall = mat[1][1] / (mat[1][0] + mat[1][1])
        print(name,
              np.around(accuracy_score(test_label, result), decimals=2),
              np.around(ham_recall, decimals=2),
              np.around(spam_recall, decimals=2))
# Load the tab-separated corpora: column 0 is the label ('ham'/'spam'),
# column 1 is the message text.
messages = pandas.read_csv('SMSSpamCollection.txt', sep='\t',
                           names=["label", "data"])
test = pandas.read_csv('testspam', sep='\t',
                       names=["label", "data"])
x_train = messages.data
x_test = test.data
y_train = messages.label
y_test = test.label

print("CountVectorizer without stopwords\n")
withoutStop(x_train, x_test, y_train, y_test)
print("\n")
# BUG FIX: the original printed "without stopwords" here too, even
# though withStop() removes English stop words.
print("CountVectorizer with stopwords\n")
withStop(x_train, x_test, y_train, y_test)
print("\n")
print("TfidfVectorizer\n")
tfidf(x_train, x_test, y_train, y_test)
print("\n")
print("LDA\n")
LDA(x_train, x_test, y_train, y_test)
print("\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement