% Spamfilter Octave 10.03.18
% Vidushi Maillart, Marx Stampfli, Reto Spöhel
%
% Student: Aleistar Markóczy
%
% ==============================================================================
% Function Reference
%
% - subplot(<rows>, <cols>, <index>): Display multiple plots in single figure
% - hist(<input vector>, <bins>, .. <attr>, <value>)
%     <bins> = [0:0.4:8] means "for x=0; x<=8; x+=0.4 { append(bins, x) }"
% - csvread(<file>, <startY>, <startX>): Read csv file
% - cvpartition(<vector>, <part_type>, <part_params>): Create partition
%     e.g. Type = "HoldOut": Training set and Test set (param = size test)
% - Create Vector from matrix: M(:, <col>) => Vector at <col>
%
%  Columns:
%
% "will","remove","you","free","charExclamation","charDollar","type"
%    1      2       3      4        5                 6          7
%
%
% ==============================================================================
%
% Encoding: UTF-8
pkg load statistics

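% A minimal usage sketch of the logical-indexing pattern from the function
% reference above. The matrix M_demo and the *_demo variables are made up
% purely for illustration (they are not part of the assignment data) and are
% cleared again right away.
M_demo   = [0.1 1; 0.0 0; 0.3 1; 0.2 0];  % columns: feature value, type (1 = spam)
spamRows = M_demo(:, 2) == 1;             % logical vector selecting the spam rows
featSpam = M_demo(spamRows, 1);           % feature column restricted to spam rows
clear M_demo spamRows featSpam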
% ==============================================================================
% 4.0.a) Required Functions
% ==============================================================================

% Histogram multiplot
function multiplotHist(hist_title, sizeY, sizeX, idx, data, col, spam)
    if (spam)
        subplot(sizeY, sizeX, idx)
        hist(data(data(:,7)==1, col), [0:0.4:8], 'facecolor', 'r')
        title(hist_title)
    else
        subplot(sizeY, sizeX, idx)
        hist(data(data(:,7)==0, col), [0:0.4:8], 'facecolor', 'b')
        title(hist_title)
    endif
endfunction

% Single Attribute A{word, threshold} Confusion Matrix
function CM = getCMSingleAttribute(data, row_select, column, thrshld)
    % Group and sizes of selected sets
    S         = sum(data(row_select, 7) == 1);
    notS      = sum(data(row_select, 7) == 0);
    E_S       = sum(data(row_select, column) > thrshld & data(row_select, 7) == 1);
    E_notS    = sum(data(row_select, column) > thrshld & data(row_select, 7) == 0);
    notE_S    = sum(data(row_select, column) <= thrshld & data(row_select, 7) == 1);
    notE_notS = sum(data(row_select, column) <= thrshld & data(row_select, 7) == 0);
    % Confusion Matrix
    CM        = [E_S, E_notS; notE_S, notE_notS];
endfunction

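% Usage example (illustrative): getCMSingleAttribute(T, training(I, 2), 5, 0)
% counts, within the training rows, how many mails with more than 0 "!"
% characters are spam (E_S) resp. non-spam (E_notS), and how many mails at or
% below the threshold are spam (notE_S) resp. non-spam (notE_notS), returning
% CM = [E_S, E_notS; notE_S, notE_notS].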
% Double Attribute A{word, threshold} Confusion Matrix
function CM = getCMBiAttribute(data, row_select, column1, column2, thrshld)
    % Group and sizes of selected sets
    S         = sum(data(row_select, 7) == 1);
    notS      = sum(data(row_select, 7) == 0);
    E_S       = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, 7) == 1);
    E_notS    = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, 7) == 0);
    notE_S    = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, 7) == 1);
    notE_notS = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, 7) == 0);
    % Confusion Matrix
    CM        = [E_S, E_notS; notE_S, notE_notS];
endfunction

% Triple Attribute A{word, threshold} Confusion Matrix
function CM = getCMTriAttribute(data, row_select, column1, column2, column3, thrshld)
    % Group and sizes of selected sets
    S         = sum(data(row_select, 7) == 1);
    notS      = sum(data(row_select, 7) == 0);
    E_S       = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, column3) > thrshld & data(row_select, 7) == 1);
    E_notS    = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, column3) > thrshld & data(row_select, 7) == 0);
    notE_S    = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, column3) <= thrshld & data(row_select, 7) == 1);
    notE_notS = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, column3) <= thrshld & data(row_select, 7) == 0);
    % Confusion Matrix
    CM        = [E_S, E_notS; notE_S, notE_notS];
endfunction

% Probabilities from a Confusion matrix
function [p_E_S, p_E_notS] = getProbability(CM)
    % Extract CM Variables
    E_S = CM(1, 1);
    E_notS = CM(1, 2);
    notE_S = CM(2, 1);
    notE_notS = CM(2, 2);
    % Probability estimations
    S = E_S + notE_S;
    notS = E_notS + notE_notS;
    p_E_S = E_S / S;
    p_E_notS = E_notS / notS;
endfunction

% Bayes Probability from a Confusion matrix and estimation factor
function [p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, f_test_S)
    % Get raw probabilities
    [p_E_S, p_E_notS] = getProbability(CM);
    % Calc Bayes probability with estimation factor f_test_S
    inv_f = 1 - f_test_S;
    p_S_E = (p_E_S * f_test_S) / (p_E_S * f_test_S + p_E_notS * inv_f);
endfunction

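% Worked example of the Bayes step above (hypothetical numbers, not taken from
% the data set): with p(E|S) = 0.8, p(E|notS) = 0.1 and prior f_test_S = 0.9,
%
%   p(S|E) = (0.8 * 0.9) / (0.8 * 0.9 + 0.1 * (1 - 0.9))
%          = 0.72 / 0.73
%          ≈ 0.986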
% Bayes probability that a mail is spam given two events A and B
function [p_S_AB, p_notS_AB] = getBiEventBayesProbability(CM_A, CM_B, f_test_S)
    % Raw probabilities per event
    [p_A_S, p_A_notS] = getProbability(CM_A);
    [p_B_S, p_B_notS] = getProbability(CM_B);

    % Combine with the prior estimate f_test_S
    inv_f = 1 - f_test_S;
    p_S_AB = (p_A_S * p_B_S * f_test_S) / (p_A_S * p_B_S * f_test_S + p_A_notS * p_B_notS * inv_f);
    p_notS_AB = (p_A_notS * p_B_notS * inv_f) / (p_A_notS * p_B_notS * inv_f + p_A_S * p_B_S * f_test_S);
endfunction

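% Note: the bi-event combination above assumes the two events are conditionally
% independent given the class, i.e. p(A and B | S) = p(A|S) * p(B|S) and
% p(A and B | notS) = p(A|notS) * p(B|notS). With f = f_test_S, Bayes' rule
% then reads
%   p(S | A and B) = p(A|S)*p(B|S)*f / (p(A|S)*p(B|S)*f + p(A|notS)*p(B|notS)*(1-f))
% The tri-event version below extends the same idea to three events.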
% Bayes probability that a mail is spam given three events A, B and C
function [p_S_ABC, p_notS_ABC] = getTriEventBayesProbability(CM_A, CM_B, CM_C, f_test_S)
    % Raw probabilities per event
    [p_A_S, p_A_notS] = getProbability(CM_A);
    [p_B_S, p_B_notS] = getProbability(CM_B);
    [p_C_S, p_C_notS] = getProbability(CM_C);

    % Combine with the prior estimate f_test_S
    inv_f = 1 - f_test_S;
    p_S_ABC = (p_A_S * p_B_S * p_C_S * f_test_S) / (p_A_S * p_B_S * p_C_S * f_test_S + p_A_notS * p_B_notS * p_C_notS * inv_f);
    p_notS_ABC = (p_A_notS * p_B_notS * p_C_notS * inv_f) / (p_A_notS * p_B_notS * p_C_notS * inv_f + p_A_S * p_B_S * p_C_S * f_test_S);
endfunction

% ==============================================================================
% 4.0.b) Initialization, convenience variables
% ==============================================================================

fileName = 'SpamFilterDataNum.csv';
T = csvread(fileName, 1, 0);

% Cross Validation Partition: training set 0.8 and test set 0.2
I = cvpartition(T(:, 7), 'HoldOut', 0.2); % training(I, 2), test(I, 2)

% ==============================================================================
% 4.1.a) Histograms
% ==============================================================================
printf('\n============================ 4.1.a ============================\n\n')

printf('Printing histograms...\n')
multiplotHist('"will": Spam',       3, 4, 1,  T, 1, 1);
multiplotHist('"will": Non-Spam',   3, 4, 2,  T, 1, 0);
multiplotHist('"remove": Spam',     3, 4, 3,  T, 2, 1);
multiplotHist('"remove": Non-Spam', 3, 4, 4,  T, 2, 0);
multiplotHist('"you": Spam',        3, 4, 5,  T, 3, 1);
multiplotHist('"you": Non-Spam',    3, 4, 6,  T, 3, 0);
multiplotHist('"free": Spam',       3, 4, 7,  T, 4, 1);
multiplotHist('"free": Non-Spam',   3, 4, 8,  T, 4, 0);
multiplotHist('"!": Spam',          3, 4, 9,  T, 5, 1);
multiplotHist('"!": Non-Spam',      3, 4, 10, T, 5, 0);
multiplotHist('"$": Spam',          3, 4, 11, T, 6, 1);
multiplotHist('"$": Non-Spam',      3, 4, 12, T, 6, 0);
printf('Done.\n')
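% Optionally, the combined histogram figure could be written to disk; the file
% name below is only an illustrative choice, not part of the assignment.
% print('histograms.png', '-dpng')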

% ==============================================================================
% 4.1.b) Estimations for p(E|S), p(E|¬S), p(S|E)
% ==============================================================================
printf('\n============================ 4.1.b ============================\n\n')

% Get Confusion Matrix for "!"
%
col = 5;      % = charExclamation
thrshld = 0;  % more than 0 occurrences of "!"
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld);

printf('\n*** Probabilities with estimation factors (0.9, 0.1):\n\n')

[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% - The estimated probabilities (around 96%) do not reach the target value of 98%
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

% ==============================================================================
% 4.1.c) Confusion Matrix of Training Data
% ==============================================================================
printf('\n============================ 4.1.c ============================\n\n')

% Get Confusion Matrix for "!"
%
col = 5;      % = charExclamation
thrshld = 0;  % more than 0 occurrences of "!"
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Example CM from this run:
%
% CM =
%
%   E_S    = 1221      E_notS    =  584
%   notE_S =  229      notE_notS = 1647
%
% What we expect:
%
% - The values E_S and notE_notS should be as large as possible
% - The values E_notS and notE_S should be as small as possible
%
% What we observe:
%
% - Number of e-mails in the training set: 1221 + 584 + 229 + 1647 = 3681
% - The ratio of false positives is too high: (584 / 3681) * 100 = 15.87%
% - The ratio of false negatives is also high: (229 / 3681) * 100 = 6.22%
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

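% Illustrative check of the ratios quoted in the analysis above, computed from
% the CM of this run (the exact values vary with the random partition):
nTrainMails = sum(CM(:));
printf('False positive ratio: %.2f%%\n', 100 * CM(1, 2) / nTrainMails);
printf('False negative ratio: %.2f%%\n', 100 * CM(2, 1) / nTrainMails);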
% ==============================================================================
% 4.1.d) Changing Threshold
% ==============================================================================
printf('\n============================ 4.1.d ============================\n\n')

thrshld = 0.1;
while thrshld <= 3.2
    printf('\n*** CM and Probabilities for threshold %g:\n\n', thrshld)
    CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
    [p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)
    thrshld = thrshld * 2;
endwhile

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
%  CM = | E_S      E_notS    |
%       | notE_S   notE_notS |
%
% Threshold 0.0: ... Threshold 0.8: ...  Threshold 3.6:
%
%   1210    604         288     42          25      7
%    241   1626        1163   2188        1426   2223
%
% => thrshld++ -> E_S--, E_notS--, notE_S++, notE_notS++
%
% With increasing threshold:
%
% - Fewer e-mails trigger the event E and are classified as spam (the number
%   of true and false positives decreases)
% - The number of (true and false) negatives increases as there are fewer
%   occurrences where E is identified
% - We decide to use a threshold of 0.2 for the next exercise
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **


% ==============================================================================
% 4.1.e) Test Criteria
% ==============================================================================
printf('\n============================ 4.1.e ============================\n\n')

printf('\n*** Training Data for threshold 0.2:\n\n')
thrshld = 0.2;
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)

printf('\n*** Test Data for threshold 0.2:\n\n')
CM = getCMSingleAttribute(T, test(I, 2), col, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
%  Training Data:               Test Data:
%
%    926    263                   212    68
%    525   1967           =>      150   490
%
%  S        = 1451              S        = 362
%  notS     = 2230              notS     = 558
%  p_E_S    = 0.63818           p_E_S    = 0.58564
%  p_E_notS = 0.11794           p_E_notS = 0.12186
%  p_S_E    = 0.97988           p_S_E    = 0.97740
%
% Comparing Test Data to Training Data:
%
% - There is no significant difference between the training results and the
%   test results; the resulting Bayes probabilities are very similar.
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **


% ==============================================================================
% 4.1.f) Changing estimation factors
% ==============================================================================
printf('\n============================ 4.1.f ============================\n\n')

printf('\n*** Probabilities with estimation factors (0.5, 0.5):\n\n')

thrshld = 0.2;
f_test_S = 0.5; % estimation that the mail is spam
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, f_test_S)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% With a lower estimation that the mail is spam:
%
% - The spam filter acts more "conservatively", since the Bayes probability
%   that a mail showing the event is spam is reduced (p_S_E = 0.84 instead
%   of 0.97). Therefore fewer false positives are produced for this criterion.
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

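% Optional sensitivity sketch (illustrative only): how p(S|E) reacts to the
% prior estimate f_test_S for the same confusion matrix.
for f = [0.1 0.5 0.9]
    p_tmp = getBayesProbability(CM, f); % first output is p_S_E
    printf('f_test_S = %.1f  ->  p(S|E) = %.4f\n', f, p_tmp);
endfor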
% ==============================================================================
% 4.1.g) My Own Spamfilter
% ==============================================================================
printf('\n============================ 4.1.g ============================\n\n')

thrshld = 0;    % Event Threshold
f_test_S = 0.5; % Estimation that the mail is spam (for Bayes)

%  Columns:
%
% "will","remove","you","free","charExclamation","charDollar","type"
%    1      2       3      4        5                 6          7
%
CM_will   = getCMSingleAttribute(T, training(I, 2), 1, thrshld)
CM_remove = getCMSingleAttribute(T, training(I, 2), 2, thrshld)
CM_you    = getCMSingleAttribute(T, training(I, 2), 3, thrshld)
CM_free   = getCMSingleAttribute(T, training(I, 2), 4, thrshld)
CM_chEx   = getCMSingleAttribute(T, training(I, 2), 5, thrshld)
CM_chD    = getCMSingleAttribute(T, training(I, 2), 6, thrshld)

CMs     = {CM_will, CM_remove, CM_you, CM_free, CM_chEx, CM_chD};
CMnames = {'will', 'remove', 'you', 'free', '!', '$'};

% Let's get an overview of the BiEvent candidates:
for i = 1:6
    for j = 1:6
        if (j <= i) continue; endif;
        printf('\n*** Bayes probabilities for "%s" and "%s":\n\n', CMnames{i}, CMnames{j})
        [p_S_E, p_notS_E] = getBiEventBayesProbability(CMs{i}, CMs{j}, f_test_S)

        % Brute Force through CMs:
        % CM = getCMBiAttribute(T, training(I, 2), i, j, thrshld)
    endfor
endfor

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Following are the most interesting (bi-event) candidates:
%
% "remove" and "free" => columns 2 and 4
CM_free_remove   = getCMBiAttribute(T, training(I, 2), 2, 4, thrshld)
% => CM_free_remove =  369      8
%                      431   2015
%
%
% "free" and "$" => columns 4 and 6
CM_dollar_free = getCMBiAttribute(T, training(I, 2), 4, 6, thrshld)
% => CM_dollar_free =  524     61
%                      302   1851
%
%
% "remove" and "$" => columns 2 and 6
CM_dollar_remove = getCMBiAttribute(T, training(I, 2), 2, 6, thrshld)
% => CM_dollar_remove =  407     12
%                        368   1984
%
%
% "!" and "$" => columns 5 and 6
CM_dollar_exclam = getCMBiAttribute(T, training(I, 2), 5, 6, thrshld)
% => CM_dollar_exclam =  778     79
%                        144   1476
%
% Analysis of these candidates:
%
% - The candidates are all better than the single-event spam filters
% - Still, they either produce too few true positives (like "free" & "remove"
%   or "$" & "remove") or too many false positives (like "$" & "!" or
%   "$" & "free").
% - We move on to tri-attribute filters
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

% We use an automated evaluation (scoring) mechanism as follows:
%
% - Correct classifications (E_S and notE_notS) count as +1 point each
% - False positives (E_notS) are penalized quadratically: -E_notS^2 points
% - False negatives (notE_S) count as -1 point each
maxVal = -Inf; % start below any possible score
a = 0; b = 0; c = 0;

% Let's get an overview of the TriEvent candidates:
for i = 1:6
    for j = 1:6
        if (j <= i) continue; endif;

        for k = 1:6
            if (k <= j) continue; endif;
            printf('\n*** Bayes probabilities for "%s", "%s" and "%s":\n\n', CMnames{i}, CMnames{j}, CMnames{k})
            [p_S_E, p_notS_E] = getTriEventBayesProbability(CMs{i}, CMs{j}, CMs{k}, f_test_S)

            % Brute Force through CMs:
            CM = getCMTriAttribute(T, training(I, 2), i, j, k, thrshld)
            % Score the candidate (see scoring rules above)
            curMax = CM(1,1) + CM(2,2) - CM(1,2)^2 - CM(2,1)
            if (curMax > maxVal)
                a = i; b = j; c = k;
                maxVal = curMax;
            endif
        endfor
    endfor
endfor

printf('\n*** Best Evaluated Filter: "%s", "%s" and "%s":\n\n', CMnames{a}, CMnames{b}, CMnames{c})
[p_S_E, p_notS_E] = getTriEventBayesProbability(CMs{a}, CMs{b}, CMs{c}, f_test_S)

printf('\n*** Training results for the selected filter:\n\n')
CM = getCMTriAttribute(T, training(I, 2), a, b, c, thrshld)

printf('\n*** Test results for the selected filter:\n\n')
CM = getCMTriAttribute(T, test(I, 2), a, b, c, thrshld)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Best Evaluated Filter: "remove", "free" and "$":
%
% p_S_E    = 0.99900
% p_notS_E = 0.0010011
%
% Training results for the selected filter:
%
% CM =
%
%     272      7
%     210   1841
%
%
% Test results for the selected filter:
%
% CM =
%
%     59     1
%     43   455
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
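
% Optional summary (illustrative only): overall accuracy of the selected
% tri-attribute filter on the test partition, using the last CM from above.
acc_test = (CM(1, 1) + CM(2, 2)) / sum(CM(:));
printf('Test accuracy of the selected filter: %.2f%%\n', 100 * acc_test);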