Advertisement
Kinrin

v232

May 24th, 2018
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.40 KB | None | 0 0
  1. import csv
  2. import pandas
  3. import sklearn
  4. import numpy as np
  5. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
  6. from sklearn.naive_bayes import MultinomialNB, BernoulliNB
  7. from sklearn.svm import LinearSVC
  8. from sklearn.metrics import confusion_matrix,accuracy_score
  9. from sklearn.pipeline import make_pipeline
  10. from sklearn.tree import DecisionTreeClassifier
  11. from sklearn.feature_extraction.text import TfidfVectorizer
  12. from sklearn.model_selection import train_test_split
  13. from sklearn.decomposition import LatentDirichletAllocation
  14. def tfidf(x_train,x_test,y_train,y_test):
  15.     cv = CountVectorizer(stop_words='english')
  16.     cv.fit_transform(x_train)
  17.     freq_term_matrix = cv.transform(x_train)
  18.     tf = TfidfTransformer()
  19.     tf.fit_transform(freq_term_matrix)
  20.     tf_idf_matrix = tf.transform(freq_term_matrix)
  21.     train_label = np.zeros(len(y_train))
  22.     for i in range(len(y_train)):
  23.         if (y_train[i] == 'spam'):
  24.             train_label[i] = 1
  25.  
  26.     cv_test = CountVectorizer(stop_words='english')
  27.     cv_test.fit_transform(x_test)
  28.     freq_term_matrix_test = cv.transform(x_test)
  29.     tf_test = TfidfTransformer()
  30.     tf_test.fit_transform(freq_term_matrix_test)
  31.     tf_idf_matrix_test = tf_test.transform(freq_term_matrix_test)
  32.     test_label = np.zeros(len(y_test))
  33.     for j in range(len(y_test)):
  34.         if (y_test[j] == 'spam'):
  35.             test_label[j] = 1
  36.  
  37.     model1 = LinearSVC()
  38.     model2 = MultinomialNB()
  39.     model3 = BernoulliNB()
  40.     model4 = DecisionTreeClassifier()
  41.  
  42.     model1.fit(tf_idf_matrix.todense(), train_label)
  43.     model2.fit(tf_idf_matrix.todense(), train_label)
  44.     model3.fit(tf_idf_matrix.todense(), train_label)
  45.     model4.fit(tf_idf_matrix.todense(), train_label)
  46.  
  47.     result1 = model1.predict(tf_idf_matrix_test.todense())
  48.     result2 = model2.predict(tf_idf_matrix_test.todense())
  49.     result3 = model3.predict(tf_idf_matrix_test.todense())
  50.     result4 = model4.predict(tf_idf_matrix_test.todense())
  51.  
  52.     mat = confusion_matrix(test_label, result1)
  53.     mat1 = confusion_matrix(test_label, result2)
  54.     mat2 = confusion_matrix(test_label, result3)
  55.     mat3 = confusion_matrix(test_label, result4)
  56.  
  57.     x = [[0 for x in range(4)] for y in range(5)]
  58.     x[0][0] = 'FD/M'
  59.     x[1][0] = 'SVM'
  60.     x[2][0] = 'MNB'
  61.     x[3][0] = 'BNB'
  62.     x[4][0] = 'DTC'
  63.     x[0][1] = 'acc'
  64.     x[0][2] = 'ham'
  65.     x[0][3] = 'spam'
  66.     x[1][1] = accuracy_score(test_label, result1)
  67.     x[2][1] = accuracy_score(test_label, result2)
  68.     x[3][1] = accuracy_score(test_label, result3)
  69.     x[4][1] = accuracy_score(test_label, result4)
  70.     x[1][2] = mat[0][0] / (mat[0][0] + mat[0][1])
  71.     x[2][2] = mat1[0][0] / (mat1[0][0] + mat1[0][1])
  72.     x[3][2] = mat2[0][0] / (mat2[0][0] + mat2[0][1])
  73.     x[4][2] = mat3[0][0] / (mat3[0][0] + mat3[0][1])
  74.     x[1][3] = mat[1][1] / (mat[1][0] + mat[1][1])
  75.     x[2][3] = mat1[1][1] / (mat1[1][0] + mat1[1][1])
  76.     x[3][3] = mat2[1][1] / (mat2[1][0] + mat2[1][1])
  77.     x[4][3] = mat3[1][1] / (mat3[1][0] + mat3[1][1])
  78.  
  79.     for i in range(len(x)):
  80.         for j in range(len(x[i])):
  81.             if ((i == 0) or (j == 0)):
  82.                 print(x[i][j], end=' ')
  83.             else:
  84.                 print(np.around(x[i][j], decimals=2), end=' ')
  85.         print()
  86. def withStop(x_train,x_test,y_train,y_test):
  87.     cv = CountVectorizer(stop_words='english')
  88.     cv.fit_transform(x_train)
  89.     freq_term_matrix = cv.transform(x_train)
  90.     train_label = np.zeros(len(y_train))
  91.     for i in range(len(y_train)):
  92.         if (y_train[i] == 'spam'):
  93.             train_label[i] = 1
  94.  
  95.     cv_test = CountVectorizer(stop_words='english')
  96.     cv_test.fit_transform(x_test)
  97.     freq_term_matrix_test = cv.transform(x_test)
  98.     test_label = np.zeros(len(y_test))
  99.     for j in range(len(y_test)):
  100.         if (y_test[j] == 'spam'):
  101.             test_label[j] = 1
  102.  
  103.     model1 = LinearSVC()
  104.     model2 = MultinomialNB()
  105.     model3 = BernoulliNB()
  106.     model4 = DecisionTreeClassifier()
  107.  
  108.     model1.fit(freq_term_matrix.todense(), train_label)
  109.     model2.fit(freq_term_matrix.todense(), train_label)
  110.     model3.fit(freq_term_matrix.todense(), train_label)
  111.     model4.fit(freq_term_matrix.todense(), train_label)
  112.  
  113.     result1 = model1.predict( freq_term_matrix_test.todense())
  114.     result2 = model2.predict( freq_term_matrix_test.todense())
  115.     result3 = model3.predict( freq_term_matrix_test.todense())
  116.     result4 = model4.predict( freq_term_matrix_test.todense())
  117.  
  118.     mat = confusion_matrix(test_label, result1)
  119.     mat1 = confusion_matrix(test_label, result2)
  120.     mat2 = confusion_matrix(test_label, result3)
  121.     mat3 = confusion_matrix(test_label, result4)
  122.  
  123.     x = [[0 for x in range(4)] for y in range(5)]
  124.     x[0][0] = 'FD/M'
  125.     x[1][0] = 'SVM'
  126.     x[2][0] = 'MNB'
  127.     x[3][0] = 'BNB'
  128.     x[4][0] = 'DTC'
  129.     x[0][1] = 'acc'
  130.     x[0][2] = 'ham'
  131.     x[0][3] = 'spam'
  132.     x[1][1] = accuracy_score(test_label, result1)
  133.     x[2][1] = accuracy_score(test_label, result2)
  134.     x[3][1] = accuracy_score(test_label, result3)
  135.     x[4][1] = accuracy_score(test_label, result4)
  136.     x[1][2] = mat[0][0] / (mat[0][0] + mat[0][1])
  137.     x[2][2] = mat1[0][0] / (mat1[0][0] + mat1[0][1])
  138.     x[3][2] = mat2[0][0] / (mat2[0][0] + mat2[0][1])
  139.     x[4][2] = mat3[0][0] / (mat3[0][0] + mat3[0][1])
  140.     x[1][3] = mat[1][1] / (mat[1][0] + mat[1][1])
  141.     x[2][3] = mat1[1][1] / (mat1[1][0] + mat1[1][1])
  142.     x[3][3] = mat2[1][1] / (mat2[1][0] + mat2[1][1])
  143.     x[4][3] = mat3[1][1] / (mat3[1][0] + mat3[1][1])
  144.  
  145.     for i in range(len(x)):
  146.         for j in range(len(x[i])):
  147.             if ((i == 0) or (j == 0)):
  148.                 print(x[i][j], end=' ')
  149.             else:
  150.                 print(np.around(x[i][j], decimals=2), end=' ')
  151.         print()
  152. def withoutStop(x_train,x_test,y_train,y_test):
  153.     cv = CountVectorizer()
  154.     cv.fit_transform(x_train)
  155.     freq_term = cv.transform(x_train)
  156.     train_label = np.zeros(len(y_train))
  157.     for i in range(len(y_train)):
  158.         if (y_train[i] == 'spam'):
  159.             train_label[i] = 1
  160.  
  161.     cv_test = CountVectorizer()
  162.     cv_test.fit_transform(x_test)
  163.     freq_term_test = cv.transform(x_test)
  164.     test_label = np.zeros(len(y_test))
  165.     for j in range(len(y_test)):
  166.         if (y_test[j] == 'spam'):
  167.             test_label[j] = 1
  168.  
  169.     model1 = LinearSVC()
  170.     model2 = MultinomialNB()
  171.     model3 = BernoulliNB()
  172.     model4 = DecisionTreeClassifier()
  173.  
  174.     model1.fit( freq_term.todense(), train_label)
  175.     model2.fit( freq_term.todense(), train_label)
  176.     model3.fit( freq_term.todense(), train_label)
  177.     model4.fit( freq_term.todense(), train_label)
  178.  
  179.     result1 = model1.predict( freq_term_test.todense())
  180.     result2 = model2.predict( freq_term_test.todense())
  181.     result3 = model3.predict( freq_term_test.todense())
  182.     result4 = model4.predict( freq_term_test.todense())
  183.  
  184.     mat = confusion_matrix(test_label, result1)
  185.     mat1 = confusion_matrix(test_label, result2)
  186.     mat2 = confusion_matrix(test_label, result3)
  187.     mat3 = confusion_matrix(test_label, result4)
  188.  
  189.     x = [[0 for x in range(4)] for y in range(5)]
  190.     x[0][0] = 'FD/M'
  191.     x[1][0] = 'SVM'
  192.     x[2][0] = 'MNB'
  193.     x[3][0] = 'BNB'
  194.     x[4][0] = 'DTC'
  195.     x[0][1] = 'acc'
  196.     x[0][2] = 'ham'
  197.     x[0][3] = 'spam'
  198.     x[1][1] = accuracy_score(test_label, result1)
  199.     x[2][1] = accuracy_score(test_label, result2)
  200.     x[3][1] = accuracy_score(test_label, result3)
  201.     x[4][1] = accuracy_score(test_label, result4)
  202.     x[1][2] = mat[0][0] / (mat[0][0] + mat[0][1])
  203.     x[2][2] = mat1[0][0] / (mat1[0][0] + mat1[0][1])
  204.     x[3][2] = mat2[0][0] / (mat2[0][0] + mat2[0][1])
  205.     x[4][2] = mat3[0][0] / (mat3[0][0] + mat3[0][1])
  206.     x[1][3] = mat[1][1] / (mat[1][0] + mat[1][1])
  207.     x[2][3] = mat1[1][1] / (mat1[1][0] + mat1[1][1])
  208.     x[3][3] = mat2[1][1] / (mat2[1][0] + mat2[1][1])
  209.     x[4][3] = mat3[1][1] / (mat3[1][0] + mat3[1][1])
  210.  
  211.     for i in range(len(x)):
  212.         for j in range(len(x[i])):
  213.             if ((i == 0) or (j == 0)):
  214.                 print(x[i][j], end=' ')
  215.             else:
  216.                 print(np.around(x[i][j], decimals=2), end=' ')
  217.         print()
  218. def LDA(x_train,x_test,y_train,y_test):
  219.     dv = CountVectorizer(stop_words='english')
  220.     dv.fit_transform(x_train)
  221.     freq_term  = dv.transform(x_train)
  222.     cv = LatentDirichletAllocation()
  223.     cv.fit_transform(freq_term  )
  224.     freq_term_matrix= cv.transform(freq_term  )
  225.     train_label = np.zeros(len(y_train))
  226.     for i in range(len(y_train)):
  227.         if (y_train[i] == 'spam'):
  228.             train_label[i] = 1
  229.     dv_test = CountVectorizer(stop_words='english')
  230.     dv_test.fit_transform(x_test)
  231.     freq_term_test = dv.transform(x_test)
  232.     cv_test = LatentDirichletAllocation()
  233.     cv_test.fit_transform(freq_term_test)
  234.     freq_term_matrix_test = cv.transform(freq_term_test)
  235.     test_label = np.zeros(len(y_test))
  236.     for j in range(len(y_test)):
  237.         if (y_test[j] == 'spam'):
  238.             test_label[j] = 1
  239.  
  240.     model1 = LinearSVC()
  241.     model2 = MultinomialNB()
  242.     model3 = BernoulliNB()
  243.     model4 = DecisionTreeClassifier()
  244.  
  245.     model1.fit(np.asmatrix(freq_term_matrix), train_label)
  246.     model2.fit(np.asmatrix(freq_term_matrix), train_label)
  247.     model3.fit(np.asmatrix(freq_term_matrix), train_label)
  248.     model4.fit(np.asmatrix(freq_term_matrix), train_label)
  249.  
  250.     result1 = model1.predict(np.asmatrix(freq_term_matrix_test))
  251.     result2 = model2.predict(np.asmatrix(freq_term_matrix_test))
  252.     result3 = model3.predict(np.asmatrix(freq_term_matrix_test))
  253.     result4 = model4.predict(np.asmatrix(freq_term_matrix_test))
  254.  
  255.     mat = confusion_matrix(test_label, result1)
  256.     mat1 = confusion_matrix(test_label, result2)
  257.     mat2 = confusion_matrix(test_label, result3)
  258.     mat3 = confusion_matrix(test_label, result4)
  259.  
  260.     x = [[0 for x in range(4)] for y in range(5)]
  261.     x[0][0] = 'FD/M'
  262.     x[1][0] = 'SVM'
  263.     x[2][0] = 'MNB'
  264.     x[3][0] = 'BNB'
  265.     x[4][0] = 'DTC'
  266.     x[0][1] = 'acc'
  267.     x[0][2] = 'ham'
  268.     x[0][3] = 'spam'
  269.     x[1][1] = accuracy_score(test_label, result1)
  270.     x[2][1] = accuracy_score(test_label, result2)
  271.     x[3][1] = accuracy_score(test_label, result3)
  272.     x[4][1] = accuracy_score(test_label, result4)
  273.     x[1][2] = mat[0][0] / (mat[0][0] + mat[0][1])
  274.     x[2][2] = mat1[0][0] / (mat1[0][0] + mat1[0][1])
  275.     x[3][2] = mat2[0][0] / (mat2[0][0] + mat2[0][1])
  276.     x[4][2] = mat3[0][0] / (mat3[0][0] + mat3[0][1])
  277.     x[1][3] = mat[1][1] / (mat[1][0] + mat[1][1])
  278.     x[2][3] = mat1[1][1] / (mat1[1][0] + mat1[1][1])
  279.     x[3][3] = mat2[1][1] / (mat2[1][0] + mat2[1][1])
  280.     x[4][3] = mat3[1][1] / (mat3[1][0] + mat3[1][1])
  281.  
  282.     for i in range(len(x)):
  283.         for j in range(len(x[i])):
  284.             if ((i == 0) or (j == 0)):
  285.                 print(x[i][j], end=' ')
  286.             else:
  287.                 print(np.around(x[i][j], decimals=2), end=' ')
  288.         print()
  289.  
  290.  
  291. messages = pandas.read_csv('SMSSpamCollection.txt', sep='\t',
  292.                            names=["label", "data"])
  293. test = pandas.read_csv('testspam', sep='\t',
  294.                            names=["label", "data"])
  295. x_train = messages.data
  296. x_test = test.data
  297. y_train = messages.label
  298. y_test = test.label
  299. print("CountVectorizer without stopwords\n")
  300. withoutStop(x_train,x_test,y_train,y_test)
  301. print("\n")
  302. print("CountVectorizer without stopwords\n")
  303. withStop(x_train,x_test,y_train,y_test)
  304. print("\n")
  305. print("TfidfVectorizer\n")
  306. tfidf(x_train,x_test,y_train,y_test)
  307. print("\n")
  308. print("LDA\n")
  309. LDA(x_train,x_test,y_train,y_test)
  310. print("\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement