% Spamfilter Octave 10.03.18
% Vidushi Maillart, Marx Stampfli, Reto Spöhel
%
% Student: Aleistar Markóczy
%
% ==============================================================================
% Function Reference
%
% - subplot(<rows>, <cols>, <index>): Display multiple plots in single figure
% - hist(<input vector>, <bins>, .. <attr>, <value>)
%     <bins> = [0:0.4:8] means "for x=0; x<=8; x+=0.4 { append(bins, x) }"
% - csvread(<file>, <startY>, <startX>): Read csv file
% - cvpartition(<vector>, <part_type>, <part_params>): Create partition
%     e.g. Type = "HoldOut": Training set and Test set (param = size test)
% - Create Vector from matrix: M(:, <col>) => Vector at <col>
%
%  Columns:
%
% "will","remove","you","free","charExclamation","charDollar","type"
%    1      2       3      4        5                 6          7
%
%
% ==============================================================================
%
% Encoding: UTF-8
pkg load statistics

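% A minimal usage sketch of the logical-indexing pattern from the function
% reference above. The matrix M_demo and the *_demo variables are made up
% purely for illustration (they are not part of the assignment data) and are
% cleared again right away.
M_demo   = [0.1 1; 0.0 0; 0.3 1; 0.2 0];  % columns: feature value, type (1 = spam)
spamRows = M_demo(:, 2) == 1;             % logical vector selecting the spam rows
featSpam = M_demo(spamRows, 1);           % feature column restricted to spam rows
clear M_demo spamRows featSpam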
% ==============================================================================
% 4.0.a) Required Functions
% ==============================================================================

% Histogram multiplot
function multiplotHist(hist_title, sizeY, sizeX, idx, data, col, spam)
    if (spam)
        subplot(sizeY, sizeX, idx)
        hist(data(data(:,7)==1, col), [0:0.4:8], 'facecolor', 'r')
        title(hist_title)
    else
        subplot(sizeY, sizeX, idx)
        hist(data(data(:,7)==0, col), [0:0.4:8], 'facecolor', 'b')
        title(hist_title)
    endif
endfunction

% Single Attribute A{word, threshold} Confusion Matrix
function CM = getCMSingleAttribute(data, row_select, column, thrshld)
    % Group and sizes of selected sets
    S         = sum(data(row_select, 7) == 1);
    notS      = sum(data(row_select, 7) == 0);
    E_S       = sum(data(row_select, column) > thrshld & data(row_select, 7) == 1);
    E_notS    = sum(data(row_select, column) > thrshld & data(row_select, 7) == 0);
    notE_S    = sum(data(row_select, column) <= thrshld & data(row_select, 7) == 1);
    notE_notS = sum(data(row_select, column) <= thrshld & data(row_select, 7) == 0);
    % Confusion Matrix
    CM        = [E_S, E_notS; notE_S, notE_notS];
endfunction

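% Usage example (illustrative): getCMSingleAttribute(T, training(I, 2), 5, 0)
% counts, within the training rows, how many mails with more than 0 "!"
% characters are spam (E_S) resp. non-spam (E_notS), and how many mails at or
% below the threshold are spam (notE_S) resp. non-spam (notE_notS), returning
% CM = [E_S, E_notS; notE_S, notE_notS].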
% Double Attribute A{word, threshold} Confusion Matrix
function CM = getCMBiAttribute(data, row_select, column1, column2, thrshld)
    % Group and sizes of selected sets
    S         = sum(data(row_select, 7) == 1);
    notS      = sum(data(row_select, 7) == 0);
    E_S       = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, 7) == 1);
    E_notS    = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, 7) == 0);
    notE_S    = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, 7) == 1);
    notE_notS = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, 7) == 0);
    % Confusion Matrix
    CM        = [E_S, E_notS; notE_S, notE_notS];
endfunction

% Triple Attribute A{word, threshold} Confusion Matrix
function CM = getCMTriAttribute(data, row_select, column1, column2, column3, thrshld)
    % Group and sizes of selected sets
    S         = sum(data(row_select, 7) == 1);
    notS      = sum(data(row_select, 7) == 0);
    E_S       = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, column3) > thrshld & data(row_select, 7) == 1);
    E_notS    = sum(data(row_select, column1) > thrshld & data(row_select, column2) > thrshld & data(row_select, column3) > thrshld & data(row_select, 7) == 0);
    notE_S    = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, column3) <= thrshld & data(row_select, 7) == 1);
    notE_notS = sum(data(row_select, column1) <= thrshld & data(row_select, column2) <= thrshld & data(row_select, column3) <= thrshld & data(row_select, 7) == 0);
    % Confusion Matrix
    CM        = [E_S, E_notS; notE_S, notE_notS];
endfunction

% Probabilities from a Confusion matrix
function [p_E_S, p_E_notS] = getProbability(CM)
    % Extract CM Variables
    E_S = CM(1, 1);
    E_notS = CM(1, 2);
    notE_S = CM(2, 1);
    notE_notS = CM(2, 2);
    % Probability estimations
    S = E_S + notE_S;
    notS = E_notS + notE_notS;
    p_E_S = E_S / S;
    p_E_notS = E_notS / notS;
endfunction

% Bayes Probability from a Confusion matrix and estimation factor
function [p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, f_test_S)
    % Get raw probabilities
    [p_E_S, p_E_notS] = getProbability(CM);
    % Calc Bayes probability with estimation factor f_test_S
    inv_f = 1 - f_test_S;
    p_S_E = (p_E_S * f_test_S) / (p_E_S * f_test_S + p_E_notS * inv_f);
endfunction

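% Worked example of the Bayes step above (hypothetical numbers, not taken from
% the data set): with p(E|S) = 0.8, p(E|notS) = 0.1 and prior f_test_S = 0.9,
%
%   p(S|E) = (0.8 * 0.9) / (0.8 * 0.9 + 0.1 * (1 - 0.9))
%          = 0.72 / 0.73
%          ≈ 0.986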
% Bayes probability that a mail is spam given two events A and B
function [p_S_AB, p_notS_AB] = getBiEventBayesProbability(CM_A, CM_B, f_test_S)
    % Raw probabilities per event
    [p_A_S, p_A_notS] = getProbability(CM_A);
    [p_B_S, p_B_notS] = getProbability(CM_B);

    % Combine with the prior estimate f_test_S
    inv_f = 1 - f_test_S;
    p_S_AB = (p_A_S * p_B_S * f_test_S) / (p_A_S * p_B_S * f_test_S + p_A_notS * p_B_notS * inv_f);
    p_notS_AB = (p_A_notS * p_B_notS * inv_f) / (p_A_notS * p_B_notS * inv_f + p_A_S * p_B_S * f_test_S);
endfunction

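% Note: the bi-event combination above assumes the two events are conditionally
% independent given the class, i.e. p(A and B | S) = p(A|S) * p(B|S) and
% p(A and B | notS) = p(A|notS) * p(B|notS). With f = f_test_S, Bayes' rule
% then reads
%   p(S | A and B) = p(A|S)*p(B|S)*f / (p(A|S)*p(B|S)*f + p(A|notS)*p(B|notS)*(1-f))
% The tri-event version below extends the same idea to three events.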
% Bayes probability that a mail is spam given three events A, B and C
function [p_S_ABC, p_notS_ABC] = getTriEventBayesProbability(CM_A, CM_B, CM_C, f_test_S)
    % Raw probabilities per event
    [p_A_S, p_A_notS] = getProbability(CM_A);
    [p_B_S, p_B_notS] = getProbability(CM_B);
    [p_C_S, p_C_notS] = getProbability(CM_C);

    % Combine with the prior estimate f_test_S
    inv_f = 1 - f_test_S;
    p_S_ABC = (p_A_S * p_B_S * p_C_S * f_test_S) / (p_A_S * p_B_S * p_C_S * f_test_S + p_A_notS * p_B_notS * p_C_notS * inv_f);
    p_notS_ABC = (p_A_notS * p_B_notS * p_C_notS * inv_f) / (p_A_notS * p_B_notS * p_C_notS * inv_f + p_A_S * p_B_S * p_C_S * f_test_S);
endfunction

% ==============================================================================
% 4.0.b) Initialization, convenience variables
% ==============================================================================

fileName = 'SpamFilterDataNum.csv';
T = csvread(fileName, 1, 0);

% Cross Validation Partition: training set 0.8 and test set 0.2
I = cvpartition(T(:, 7), 'HoldOut', 0.2); % training(I, 2), test(I, 2)

% ==============================================================================
% 4.1.a) Histograms
% ==============================================================================
printf('\n============================ 4.1.a ============================\n\n')

printf('Printing histograms...\n')
multiplotHist('"will": Spam',       3, 4, 1,  T, 1, 1);
multiplotHist('"will": Non-Spam',   3, 4, 2,  T, 1, 0);
multiplotHist('"remove": Spam',     3, 4, 3,  T, 2, 1);
multiplotHist('"remove": Non-Spam', 3, 4, 4,  T, 2, 0);
multiplotHist('"you": Spam',        3, 4, 5,  T, 3, 1);
multiplotHist('"you": Non-Spam',    3, 4, 6,  T, 3, 0);
multiplotHist('"free": Spam',       3, 4, 7,  T, 4, 1);
multiplotHist('"free": Non-Spam',   3, 4, 8,  T, 4, 0);
multiplotHist('"!": Spam',          3, 4, 9,  T, 5, 1);
multiplotHist('"!": Non-Spam',      3, 4, 10, T, 5, 0);
multiplotHist('"$": Spam',          3, 4, 11, T, 6, 1);
multiplotHist('"$": Non-Spam',      3, 4, 12, T, 6, 0);
printf('Done.\n')
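% Optionally, the combined histogram figure could be written to disk; the file
% name below is only an illustrative choice, not part of the assignment.
% print('histograms.png', '-dpng')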

% ==============================================================================
% 4.1.b) Estimations for p(E|S), p(E|¬S), p(S|E)
% ==============================================================================
printf('\n============================ 4.1.b ============================\n\n')

% Get Confusion Matrix for "!"
%
col = 5;      % = charExclamation
thrshld = 0;  % more than 0 occurrences of "!"
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld);

printf('\n*** Probabilities with estimation factors (0.9, 0.1):\n\n')

[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% - The estimated probabilities (around 96%) do not reach the target value of 98%
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

% ==============================================================================
% 4.1.c) Confusion Matrix of Training Data
% ==============================================================================
printf('\n============================ 4.1.c ============================\n\n')

% Get Confusion Matrix for "!"
%
col = 5;      % = charExclamation
thrshld = 0;  % more than 0 occurrences of "!"
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Example CM from this run:
%
% CM =
%
%   E_S    = 1221      E_notS    =  584
%   notE_S =  229      notE_notS = 1647
%
% What we expect:
%
% - The values E_S and notE_notS should be as large as possible
% - The values E_notS and notE_S should be as small as possible
%
% What we observe:
%
% - Number of e-mails in the training set: 1221 + 584 + 229 + 1647 = 3681
% - The ratio of false positives is too high: (584 / 3681) * 100 = 15.87%
% - The ratio of false negatives is also high: (229 / 3681) * 100 = 6.22%
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

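% Illustrative check of the ratios quoted in the analysis above, computed from
% the CM of this run (the exact values vary with the random partition):
nTrainMails = sum(CM(:));
printf('False positive ratio: %.2f%%\n', 100 * CM(1, 2) / nTrainMails);
printf('False negative ratio: %.2f%%\n', 100 * CM(2, 1) / nTrainMails);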
% ==============================================================================
% 4.1.d) Changing Threshold
% ==============================================================================
printf('\n============================ 4.1.d ============================\n\n')

thrshld = 0.1;
while thrshld <= 3.2
    printf('\n*** CM and Probabilities for threshold %g:\n\n', thrshld)
    CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
    [p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)
    thrshld = thrshld * 2;
endwhile

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
%  CM = | E_S      E_notS    |
%       | notE_S   notE_notS |
%
% Threshold 0.0: ... Threshold 0.8: ...  Threshold 3.6:
%
%   1210    604         288     42          25      7
%    241   1626        1163   2188        1426   2223
%
% => thrshld++ -> E_S--, E_notS--, notE_S++, notE_notS++
%
% With increasing threshold:
%
% - Fewer e-mails trigger the event E and are classified as spam (the number
%   of true and false positives decreases)
% - The number of (true and false) negatives increases as there are fewer
%   occurrences where E is identified
% - We decide to use a threshold of 0.2 for the next exercise
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **


% ==============================================================================
% 4.1.e) Test Criteria
% ==============================================================================
printf('\n============================ 4.1.e ============================\n\n')

printf('\n*** Training Data for threshold 0.2:\n\n')
thrshld = 0.2;
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)

printf('\n*** Test Data for threshold 0.2:\n\n')
CM = getCMSingleAttribute(T, test(I, 2), col, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, 0.9)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
%  Training Data:               Test Data:
%
%    926    263                   212    68
%    525   1967           =>      150   490
%
%  S        = 1451              S        = 362
%  notS     = 2230              notS     = 558
%  p_E_S    = 0.63818           p_E_S    = 0.58564
%  p_E_notS = 0.11794           p_E_notS = 0.12186
%  p_S_E    = 0.97988           p_S_E    = 0.97740
%
% Comparing Test Data to Training Data:
%
% - There is no significant difference between the training results and the
%   test results; the resulting Bayes probabilities are very similar.
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **


% ==============================================================================
% 4.1.f) Changing estimation factors
% ==============================================================================
printf('\n============================ 4.1.f ============================\n\n')

printf('\n*** Probabilities with estimation factors (0.5, 0.5):\n\n')

thrshld = 0.2;
f_test_S = 0.5; % estimation that the mail is spam
CM = getCMSingleAttribute(T, training(I, 2), col, thrshld)
[p_S_E, p_E_S, p_E_notS] = getBayesProbability(CM, f_test_S)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% With a lower estimation that the mail is spam:
%
% - The spam filter acts more "conservatively", since the Bayes probability
%   that a mail showing the event is spam is reduced (p_S_E = 0.84 instead
%   of 0.97). Therefore fewer false positives are produced for this criterion.
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

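% Optional sensitivity sketch (illustrative only): how p(S|E) reacts to the
% prior estimate f_test_S for the same confusion matrix.
for f = [0.1 0.5 0.9]
    p_tmp = getBayesProbability(CM, f); % first output is p_S_E
    printf('f_test_S = %.1f  ->  p(S|E) = %.4f\n', f, p_tmp);
endfor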
% ==============================================================================
% 4.1.g) My Own Spamfilter
% ==============================================================================
printf('\n============================ 4.1.g ============================\n\n')

thrshld = 0;    % Event Threshold
f_test_S = 0.5; % Estimation that the mail is spam (for Bayes)

%  Columns:
%
% "will","remove","you","free","charExclamation","charDollar","type"
%    1      2       3      4        5                 6          7
%
CM_will   = getCMSingleAttribute(T, training(I, 2), 1, thrshld)
CM_remove = getCMSingleAttribute(T, training(I, 2), 2, thrshld)
CM_you    = getCMSingleAttribute(T, training(I, 2), 3, thrshld)
CM_free   = getCMSingleAttribute(T, training(I, 2), 4, thrshld)
CM_chEx   = getCMSingleAttribute(T, training(I, 2), 5, thrshld)
CM_chD    = getCMSingleAttribute(T, training(I, 2), 6, thrshld)

CMs     = {CM_will, CM_remove, CM_you, CM_free, CM_chEx, CM_chD};
CMnames = {'will', 'remove', 'you', 'free', '!', '$'};

% Let's get an overview of the BiEvent candidates:
for i = 1:6
    for j = 1:6
        if (j <= i) continue; endif;
        printf('\n*** Bayes probabilities for "%s" and "%s":\n\n', CMnames{i}, CMnames{j})
        [p_S_E, p_notS_E] = getBiEventBayesProbability(CMs{i}, CMs{j}, f_test_S)

        % Brute Force through CMs:
        % CM = getCMBiAttribute(T, training(I, 2), i, j, thrshld)
    endfor
endfor

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Following are the most interesting (bi-event) candidates:
%
% "remove" and "free" => columns 2 and 4
CM_free_remove   = getCMBiAttribute(T, training(I, 2), 2, 4, thrshld)
% => CM_free_remove =  369      8
%                      431   2015
%
%
% "free" and "$" => columns 4 and 6
CM_dollar_free = getCMBiAttribute(T, training(I, 2), 4, 6, thrshld)
% => CM_dollar_free =  524     61
%                      302   1851
%
%
% "remove" and "$" => columns 2 and 6
CM_dollar_remove = getCMBiAttribute(T, training(I, 2), 2, 6, thrshld)
% => CM_dollar_remove =  407     12
%                        368   1984
%
%
% "!" and "$" => columns 5 and 6
CM_dollar_exclam = getCMBiAttribute(T, training(I, 2), 5, 6, thrshld)
% => CM_dollar_exclam =  778     79
%                        144   1476
%
% Analysis of these candidates:
%
% - The candidates are all better than the single-event spam filters
% - Still, they either produce too few true positives (like "free" & "remove"
%   or "$" & "remove") or too many false positives (like "$" & "!" or
%   "$" & "free").
% - We move on to tri-attribute filters
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **

% We use an automated evaluation (scoring) mechanism as follows:
%
% - Correct classifications (E_S and notE_notS) count as +1 point each
% - False positives (E_notS) are penalized quadratically: -E_notS^2 points
% - False negatives (notE_S) count as -1 point each
maxVal = -Inf; % start below any possible score
a = 0; b = 0; c = 0;

% Let's get an overview of the TriEvent candidates:
for i = 1:6
    for j = 1:6
        if (j <= i) continue; endif;

        for k = 1:6
            if (k <= j) continue; endif;
            printf('\n*** Bayes probabilities for "%s", "%s" and "%s":\n\n', CMnames{i}, CMnames{j}, CMnames{k})
            [p_S_E, p_notS_E] = getTriEventBayesProbability(CMs{i}, CMs{j}, CMs{k}, f_test_S)

            % Brute Force through CMs:
            CM = getCMTriAttribute(T, training(I, 2), i, j, k, thrshld)
            % Score the candidate (see scoring rules above)
            curMax = CM(1,1) + CM(2,2) - CM(1,2)^2 - CM(2,1)
            if (curMax > maxVal)
                a = i; b = j; c = k;
                maxVal = curMax;
            endif
        endfor
    endfor
endfor

printf('\n*** Best Evaluated Filter: "%s", "%s" and "%s":\n\n', CMnames{a}, CMnames{b}, CMnames{c})
[p_S_E, p_notS_E] = getTriEventBayesProbability(CMs{a}, CMs{b}, CMs{c}, f_test_S)

printf('\n*** Training results for the selected filter:\n\n')
CM = getCMTriAttribute(T, training(I, 2), a, b, c, thrshld)

printf('\n*** Test results for the selected filter:\n\n')
CM = getCMTriAttribute(T, test(I, 2), a, b, c, thrshld)

% ** ** ** ** ** ** ** ** ** ** ** Analysis ** ** ** ** ** ** ** ** ** ** ** **
%
% Best Evaluated Filter: "remove", "free" and "$":
%
% p_S_E    = 0.99900
% p_notS_E = 0.0010011
%
% Training results for the selected filter:
%
% CM =
%
%     272      7
%     210   1841
%
%
% Test results for the selected filter:
%
% CM =
%
%     59     1
%     43   455
%
% ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **
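
% Optional summary (illustrative only): overall accuracy of the selected
% tri-attribute filter on the test partition, using the last CM from above.
acc_test = (CM(1, 1) + CM(2, 2)) / sum(CM(:));
printf('Test accuracy of the selected filter: %.2f%%\n', 100 * acc_test);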