Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- % Spamfilter Octave 10.03.18
- % Vidushi Maillart, Marx Stampfli, Reto Spöhel
- %
- % Student: Aleistar Markóczy
- %
- % ==============================================================================
- % Function Reference
- %
- % - subplot(<rows>, <cols>, <index>): Display multiple plots in single figure
- % - hist(<input vector>, <bins>, .. <attr>, <value>)
- % <bins> = [0:0.4:8] means "for x=0; x<=8; x+=0.4 { append(bins, x) }"
- % - csvread(<file>, <startY>, <startX>): Read csv file
- % - cvpartition(<vector>, <part_type>, <part_params>): Create partition
- % e.g. Type = "HoldOut": Training set and Test set (param = size test)
- % - Create Vector from matrix: M(:, <col>) => Vector at <col>
- %
- % Columns:
- %
- % "will","remove","you","free","charExclamation","charDollar","type"
- % 1 2 3 4 5 6 7
- %
- %
- % ==============================================================================
- %
- % Encoding: UTF-8
- pkg load statistics
- % ==============================================================================
- % 4.0.a) Required Functions
- % ==============================================================================
% Histogram multiplot
%
% Draws one histogram of a single attribute column into slot <idx> of a
% <sizeY> x <sizeX> subplot grid and titles it with <hist_title>.
%
%   data - matrix whose column 7 holds the class label (1 or 0)
%   col  - attribute column to plot
%   spam - truthy: plot rows with label 1 in red;
%          falsy:  plot rows with label 0 in blue
function multiplotHist(hist_title, sizeY, sizeX, idx, data, col, spam)
  % Choose the class label and bar colour first so the plotting code exists once
  if (spam)
    wanted_type = 1;
    bar_color = 'r';
  else
    wanted_type = 0;
    bar_color = 'b';
  endif
  picked_rows = data(:, 7) == wanted_type;
  subplot(sizeY, sizeX, idx);
  % Fixed bin centres 0, 0.4, ..., 8
  hist(data(picked_rows, col), [0:0.4:8], 'facecolor', bar_color);
  title(hist_title);
endfunction
% Single Attribute A{word, threshold} Confusion Matrix
%
% Event E: the value in <column> is strictly greater than <thrshld>.
% Class S: column 7 equals 1; class notS: column 7 equals 0.
%
%   data       - numeric matrix; column 7 holds the class label
%   row_select - row indices or logical mask selecting the rows to count
%   column     - attribute column to test
%   thrshld    - threshold the attribute value is compared against
%
% Returns CM = [E_S, E_notS; notE_S, notE_notS] (counts over selected rows).
function CM = getCMSingleAttribute(data, row_select, column, thrshld)
  % Index the selected rows once instead of once per cell
  labels = data(row_select, 7);
  values = data(row_select, column);
  is_S = labels == 1;
  is_notS = labels == 0;
  E = values > thrshld;
  notE = values <= thrshld;
  % Confusion Matrix
  CM = [sum(E & is_S),    sum(E & is_notS); ...
        sum(notE & is_S), sum(notE & is_notS)];
endfunction
% Double Attribute A{word, threshold} Confusion Matrix
%
% Event E: the values in BOTH <column1> and <column2> are strictly greater
% than <thrshld>. notE is the complement of E, so a row where only one of the
% two attributes exceeds the threshold counts as notE. The four cells thus
% cover every selected row whose label is 0 or 1.
%
%   data       - numeric matrix; column 7 holds the class label (1 = S, 0 = notS)
%   row_select - row indices or logical mask selecting the rows to count
%   column1/2  - attribute columns to test
%   thrshld    - threshold applied to both attributes
%
% Returns CM = [E_S, E_notS; notE_S, notE_notS] (counts over selected rows).
function CM = getCMBiAttribute(data, row_select, column1, column2, thrshld)
  labels = data(row_select, 7);
  is_S = labels == 1;
  is_notS = labels == 0;
  % Row-wise AND over the two attribute columns
  E = all(data(row_select, [column1, column2]) > thrshld, 2);
  notE = ~E;
  % Confusion Matrix
  CM = [sum(E & is_S),    sum(E & is_notS); ...
        sum(notE & is_S), sum(notE & is_notS)];
endfunction
% Triple Attribute A{word, threshold} Confusion Matrix
%
% Event E: the values in ALL of <column1>, <column2> and <column3> are
% strictly greater than <thrshld>. notE is the complement of E, so a row where
% only some of the attributes exceed the threshold counts as notE. The four
% cells thus cover every selected row whose label is 0 or 1.
%
%   data        - numeric matrix; column 7 holds the class label (1 = S, 0 = notS)
%   row_select  - row indices or logical mask selecting the rows to count
%   column1/2/3 - attribute columns to test
%   thrshld     - threshold applied to all three attributes
%
% Returns CM = [E_S, E_notS; notE_S, notE_notS] (counts over selected rows).
function CM = getCMTriAttribute(data, row_select, column1, column2, column3, thrshld)
  labels = data(row_select, 7);
  is_S = labels == 1;
  is_notS = labels == 0;
  % Row-wise AND over the three attribute columns
  E = all(data(row_select, [column1, column2, column3]) > thrshld, 2);
  notE = ~E;
  % Confusion Matrix
  CM = [sum(E & is_S),    sum(E & is_notS); ...
        sum(notE & is_S), sum(notE & is_notS)];
endfunction
% Probabilities from a Confusion matrix
%
% Expects CM = [E_S, E_notS; notE_S, notE_notS] and returns the relative
% frequency of the event within each class:
%   p_E_S    = E_S    / (E_S + notE_S)
%   p_E_notS = E_notS / (E_notS + notE_notS)
function [p_E_S, p_E_notS] = getProbability(CM)
  % Only the upper-left 2x2 part is read
  counts = CM(1:2, 1:2);
  % Column sums are the class sizes [S, notS]
  class_sizes = sum(counts, 1);
  % First row holds the event counts per class
  rates = counts(1, :) ./ class_sizes;
  p_E_S = rates(1);
  p_E_notS = rates(2);
endfunction
% Bayes Probability from a Confusion matrix and estimation factor
%
%   CM       - confusion matrix [E_S, E_notS; notE_S, notE_notS]
%   f_test_S - estimation factor used as the weight of class S;
%              (1 - f_test_S) is used as the weight of class notS
%
% Returns p_S_E together with the raw p_E_S and p_E_notS taken from CM.
function [p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, f_test_S)
  % Raw per-class event probabilities
  [p_E_S, p_E_notS] = getProbability(CM);
  % Weight each class by its estimation factor, then normalise
  weight_S = p_E_S * f_test_S;
  weight_notS = p_E_notS * (1 - f_test_S);
  p_S_E = weight_S / (weight_S + weight_notS);
endfunction
% Bayes probabilities for two events A and B, each described by its own
% single-attribute confusion matrix. The per-class probabilities of A and B
% are multiplied and weighted with f_test_S (class S) and 1 - f_test_S
% (class notS).
%
% Returns p_S_AB and p_notS_AB.
function [p_S_AB, p_notS_AB] = getBiEventBayesProbability(CM_A, CM_B, f_test_S)
  [p_A_S, p_A_notS] = getProbability(CM_A);
  [p_B_S, p_B_notS] = getProbability(CM_B);
  % Unnormalised weight of each class
  weight_S = p_A_S * p_B_S * f_test_S;
  weight_notS = p_A_notS * p_B_notS * (1 - f_test_S);
  % Normalise
  p_S_AB = weight_S / (weight_S + weight_notS);
  p_notS_AB = weight_notS / (weight_notS + weight_S);
endfunction
% Bayes probabilities for three events A, B and C, each described by its own
% single-attribute confusion matrix. The per-class probabilities of the three
% events are multiplied and weighted with f_test_S (class S) and
% 1 - f_test_S (class notS).
%
% Returns p_S_ABC and p_notS_ABC.
function [p_S_ABC, p_notS_ABC] = getTriEventBayesProbability(CM_A, CM_B, CM_C, f_test_S)
  [p_A_S, p_A_notS] = getProbability(CM_A);
  [p_B_S, p_B_notS] = getProbability(CM_B);
  [p_C_S, p_C_notS] = getProbability(CM_C);
  % Unnormalised weight of each class
  weight_S = p_A_S * p_B_S * p_C_S * f_test_S;
  weight_notS = p_A_notS * p_B_notS * p_C_notS * (1 - f_test_S);
  % Normalise
  p_S_ABC = weight_S / (weight_S + weight_notS);
  p_notS_ABC = weight_notS / (weight_notS + weight_S);
endfunction
- % ==============================================================================
- % 4.0.b) Initialization, convenience variables
- % ==============================================================================
% Read the data starting at row offset 1, column offset 0
% (presumably skipping a header row with the column names - confirm against the CSV)
fileName = 'SpamFilterDataNum.csv';
T = csvread(fileName, 1, 0);
% Cross Validation Partition: training set 0.8 and test set 0.2
I = cvpartition(T(:, 7), 'HoldOut', 0.2); % training(I, 2), test(I, 2)
% ==============================================================================
% 4.1.a) Histograms
% ==============================================================================
fprintf('\n============================ 4.1.a ============================\n\n');
fprintf('Printing histograms...\n');
% Attribute k occupies subplot slots 2k-1 (spam) and 2k (non-spam) of a 3 x 4 grid
attrLabels = {'will', 'remove', 'you', 'free', '!', '$'};
for attrCol = 1:numel(attrLabels)
  quotedLabel = ['"', attrLabels{attrCol}, '"'];
  multiplotHist([quotedLabel, ': Spam'], 3, 4, 2 * attrCol - 1, T, attrCol, 1);
  multiplotHist([quotedLabel, ': Non-Spam'], 3, 4, 2 * attrCol, T, attrCol, 0);
end
fprintf('Done.\n');
- % ==============================================================================
- % 4.1.b) Estimations for p(E|S), p(E|¬S), p(S|E)
- % ==============================================================================
printf('\n============================ 4.1.b ============================\n\n')
% Get Confusion Matrix for "!"
%
col = 5; % = charExclamation
thrshld = 0; % more than 0 of ..
% Pass the named column instead of repeating the literal 5
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld);
printf('\n*** Probabilities with estimation factors (0.9, 0.1):\n\n')
% No trailing semicolon: the three results are displayed
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)
- % ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
- %
- % - The estimations (around 96%) do not reach the threshold value of 98%
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
- % ==============================================================================
- % 4.1.c) Confusion Matrix of Training Data
- % ==============================================================================
printf('\n============================ 4.1.c ============================\n\n')
% Get Confusion Matrix for "!"
%
col = 5; % = charExclamation
thrshld = 0; % more than 0 of ..
% Pass the named column instead of repeating the literal 5;
% no trailing semicolon so the matrix is displayed
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
- % ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
- %
- % Example CM from this run:
- %
- % CM =
- %
- % E_S = 1221 E_notS = 584
- % notE_S = 229 notE_notS = 1647
- %
- % What is Expected:
- %
- % - The Values E_S and notE_notS should be as big as possible
- % - The Values E_notS and notE_S should be as small as possible
- %
- % What is given:
- %
- % - Number of tries in training set: 1221 + 584 + 229 + 1647 = 3681
- % - The ratio of false positives is too high: (584 / 3681) * 100 = 15.86%
- % - The ratio of false negatives is also high: (229 / 3681) * 100 = 6.21%
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
- % ==============================================================================
- % 4.1.d) Changing Threshold
- % ==============================================================================
printf('\n============================ 4.1.d ============================\n\n')
% Sweep the "!" threshold over 0.1, 0.2, 0.4, 0.8, 1.6, 3.2 (doubling each step)
thrshld = 0.1;
while thrshld <= 3.2
  % The thresholds are fractional, so use a floating-point conversion
  % rather than the integer conversion d
  printf('\n*** CM and Probabilities for threshold %g:\n\n', thrshld)
  CM = getCMSingleAttribute(T, training(I, 2), 5, thrshld)
  [p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)
  thrshld = thrshld * 2;
endwhile
- % ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
- %
- % CM = | E_S E_notS |
- % | notE_S notE_notS |
- %
- % Threshold 0.0: ... Threshold 0.8: ... Threshold 3.6:
- %
- % 1210 604 288 42 25 7
- % 241 1626 1163 2188 1426 2223
- %
- % => thrshld++ -> E_S--, E_notS--, notE_S++, notE_notS++
- %
- % With increasing Threshold:
- %
- % - Fewer e-mails are flagged as spam: both the true positives (E_S) and the
- % false positives (E_notS) decrease
- % - The number of (true and false) negatives increases as there are fewer
- % occurrences where E is identified
- % > We decide to use a threshold of 0.2 for the next exercise
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
- % ==============================================================================
- % 4.1.e) Test Criterias
- % ==============================================================================
fprintf('\n============================ 4.1.e ============================\n\n');
thrshld = 0.2;
% Row selectors of the hold-out partition
trainRows = training(I, 2);
testRows = test(I, 2);
% Column 5 = charExclamation; results are displayed (no trailing semicolon)
fprintf('\n*** Training Data for threshold 0.2:\n\n');
CM = getCMSingleAttribute(T, trainRows, 5, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)
fprintf('\n*** Test Data for threshold 0.2:\n\n');
CM = getCMSingleAttribute(T, testRows, 5, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)
- % ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
- %
- % Training Data: Test Data:
- %
- % 926 263 212 68
- % 525 1967 => 150 490
- %
- % S = 1451 S = 362
- % notS = 2230 notS = 558
- % p_E_S = 0.63818 p_E_S = 0.58564
- % p_E_notS = 0.11794 p_E_notS = 0.12186
- % p_S_E = 0.97988 p_S_E = 0.97740
- %
- % Comparing Test Data to Training Data:
- %
- % - As we can see there is no significant difference between the training
- % results and the test results, the resulting bayes probabilities are
- % much alike.
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
- % ==============================================================================
- % 4.1.f) Changing estimation factors
- % ==============================================================================
fprintf('\n============================ 4.1.f ============================\n\n');
fprintf('\n*** Probabilities with estimation factors (0.5, 0.5):\n\n');
f_test_S = 0.5; % estimation that Mail is Spam
thrshld = 0.2;
% Column 5 = charExclamation, evaluated on the training rows;
% both results are displayed (no trailing semicolon)
CM = getCMSingleAttribute(T, training(I, 2), 5, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, f_test_S)
- % ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
- %
- % With a lower estimation that the mail is spam:
- %
- % - The Spamfilter acts more "conservative" as the bayes probability that the
- % given event is spam is reduced (p_S_E = 0.84 instead of 0.97). Therefore
- % less false positives are produced for this criteria.
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
- % ==============================================================================
- % 4.1.g) My Own Spamfilter
- % ==============================================================================
fprintf('\n============================ 4.1.g ============================\n\n');
thrshld = 0; % Event Threshold
f_test_S = 0.5; % Estimation that Mail is Spam (for bayes)
% Columns:
%
% "will","remove","you","free","charExclamation","charDollar","type"
%   1       2       3     4          5               6          7
%
% One single-attribute confusion matrix per column, on the training rows.
% Each assignment is left without a semicolon so the matrix is displayed.
trainRows = training(I, 2);
CM_will = getCMSingleAttribute(T, trainRows, 1, thrshld)
CM_remove = getCMSingleAttribute(T, trainRows, 2, thrshld)
CM_you = getCMSingleAttribute(T, trainRows, 3, thrshld)
CM_free = getCMSingleAttribute(T, trainRows, 4, thrshld)
CM_chEx = getCMSingleAttribute(T, trainRows, 5, thrshld)
CM_chD = getCMSingleAttribute(T, trainRows, 6, thrshld)
CMs = {CM_will, CM_remove, CM_you, CM_free, CM_chEx, CM_chD};
CMnames = {'will', 'remove', 'you', 'free', '!', '$'};
% Let's get an overview of the BiEvent candidates:
% visit every unordered pair (i, j) with i < j exactly once
for i = 1:6
  for j = (i + 1):6
    fprintf('\n*** Bayes probabilities for "%s" and "%s":\n\n', CMnames{i}, CMnames{j});
    [p_S_E, p_notS_E] = getBiEventBayesProbability(CMs{i}, CMs{j}, f_test_S)
    % Brute Force through CMs:
    % CM = getCMBiAttribute(T, training(I, 2), i, j, thrshld)
  end
end
% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Following are the most interesting (bi-event) candidates:
%
% All four matrices are computed on the training rows and displayed.
% The numbers in the comments are example values from one run.
candidateRows = training(I, 2);
%
% "free" and "remove" => 2 and 4
CM_free_remove = getCMBiAttribute(T, candidateRows, 2, 4, thrshld)
% => CM_free_remove = 369 8
%                     431 2015
%
%
% $ and "free" => 4 and 6
CM_dollar_free = getCMBiAttribute(T, candidateRows, 4, 6, thrshld)
% => CM_dollar_free = 524 61
%                     302 1851
%
%
% $ and "remove" => 2 and 6
CM_dollar_remove = getCMBiAttribute(T, candidateRows, 2, 6, thrshld)
% => CM_dollar_remove = 407 12
%                       368 1984
%
%
% $ and ! => 5 and 6
CM_dollar_exclam = getCMBiAttribute(T, candidateRows, 5, 6, thrshld)
% CM_dollar_exclam = 778 79
%                    144 1476
- %
- % Analysis of these candidates:
- %
- % - The Candidates are all better than the single event Spam filters
- % - Still they either have a too small amount of true positives
- % (like "free" & "remove" or "$" & "remove") or a too high amount of
- % false positives (like "$" & "!" or "$" & "free").
- % - We go on with tri-attributes
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
- % We use an automated evaluation mechanism as follows:
- %
- % - Correct classifications (E_S and notE_notS) count as 1 Point each
- % - False positives (E_notS) count as -n^2 Points (squared negative)
- % - False negatives (notE_S) count as -1 Point
% Best score seen so far. Start at -Inf so the first evaluated triple always
% becomes the current best: the squared false-positive penalty can push every
% score below zero, and a start value of 0 would then leave a = b = c = 0 and
% make the CMnames{a} lookup after the loop fail.
maxVal = -Inf;
a = 0; b = 0; c = 0;
% Let's get an overview of the TriEvent candidates:
% visit every unordered triple (i, j, k) with i < j < k exactly once
for i = 1:6
  for j = (i + 1):6
    for k = (j + 1):6
      printf('\n*** Bayes probabilities for "%s", "%s" and "%s":\n\n', CMnames{i}, CMnames{j}, CMnames{k})
      [p_S_E, p_notS_E] = getTriEventBayesProbability(CMs{i}, CMs{j}, CMs{k}, f_test_S)
      % Brute Force through CMs:
      CM = getCMTriAttribute(T, training(I, 2), i, j, k, thrshld)
      % Score: +1 per correct classification, -(false positives)^2,
      % -1 per false negative
      curMax = CM(1,1) + CM(2, 2) - CM(1,2)^2 - CM(2, 1)
      if (curMax > maxVal)
        % Semicolon after c = k so the index is not echoed on every improvement
        a = i; b = j; c = k;
        maxVal = curMax;
      endif
    endfor
  endfor
endfor
printf('\n*** Best Evaluated Filter: "%s", "%s" and "%s":\n\n', CMnames{a}, CMnames{b}, CMnames{c})
[p_S_E, p_notS_E] = getTriEventBayesProbability(CMs{a}, CMs{b}, CMs{c}, f_test_S)
printf('\n*** Training results for given Filter:\n\n')
CM = getCMTriAttribute(T, training(I, 2), a, b, c, thrshld)
printf('\n*** Test results for given Filter:\n\n')
CM = getCMTriAttribute(T, test(I, 2), a, b, c, thrshld)
- % ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
- %
- % Best Evaluated Filter: "remove", "free" and "$":
- %
- % p_S_E = 0.99900
- % p_notS_E = 0.0010011
- %
- % Training results for given Filter:
- %
- % CM =
- %
- % 272 7
- % 210 1841
- %
- %
- % Test results for given Filter:
- %
- % CM =
- %
- % 59 1
- % 43 455
- %
- % ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement