Untitled

%% Here we collect  the data and we plot the histogram of both samples
% 161 W (expected)
% 174 M (expected)

W   = [160 184 180 158 173 160 176 168];
M   = [175 162 186 182 174 185 180 179 178];

figure()
hold on
histogram(W,158:4:195)
histogram(M,160:4:195)

%% Calculate the size and the mean of each sample

n_W = length(W);
n_M = length(M);

mean_W  = mean(W);
mean_M  = mean(M);

disp(['The mean of the heights of women in the class is ',...
       num2str(mean_W,4),' cm']);
disp(['The mean of the heights of men in the class is ',...
       num2str(mean_M,4),' cm']);

%% Calculate the estimate of variance of the population

var_W   = var(W);
var_M   = var(M);

%% Calculate the standard error of the mean
% this is the estimate of the standard deviation of the estimate of the mean

sem_W   = sqrt(var_W/n_W);
sem_M   = sqrt(var_M/n_M);

%% Calculate the confidence 95% confidence intervals assuming the CLT

CI_W    = [mean_W - 2*sem_W, mean_W + 2*sem_W];
CI_M    = [mean_M - 2*sem_M, mean_M + 2*sem_M];

disp(['The 95% CI for the mean of the heights of women in the class is (',...
       num2str(CI_W,4),') cm']);
disp(['The 95% CI for the mean of the heights of men in the class is (',...
       num2str(CI_M,4),') cm']);


%% Now we calculate the 95% CIs with bootstrap

N_B = 10000; %number of bootstrap samples

all_means_W = zeros(1,N_B);
all_means_M = zeros(1,N_B);

for k=1:N_B
    W_B = randsample(W,n_W,1);
    M_B = randsample(M,n_M,1);

    mean_W_B = mean(W_B);
    mean_M_B = mean(M_B);

    all_means_W(k) = mean_W_B;
    all_means_M(k) = mean_M_B;
end

% plot the histograms
figure()
histogram(all_means_W,150:2:190)
hold on
histogram(all_means_M,150:2:190)

% compute the CIs using the quantiles of the surrogate means

CI_B_W = [quantile(all_means_W,.025), quantile(all_means_W,.975)];
CI_B_M = [quantile(all_means_M,.025), quantile(all_means_M,.975)];

disp(['The 95% CI for the mean of the heights of women in the class is (',...
       num2str(CI_B_W,4),') cm']);
disp(['The 95% CI for the mean of the heights of men in the class is (',...
       num2str(CI_B_M,4),') cm']);


%% Let's do now the hypothesis testing
% Null hypothesis: there is no difference between the mean of the heights of men
%                  and women
% Alternative hypothesis: there is some difference between the mean of the
%                         heights of mean and women (but we don't say which
%                         direction)
% alpha = 0.05
% test statistic = difference of the mean of the two samples

difference = mean_M - mean_W;
disp(['The observed difference between the means is ',...
       num2str(difference,2),' cm']);

%% We first assume that we can apply CLT
% this allows us to calculate directly the distribution of the statistic under
% te null hypothesis

% we calculate the standard deviation of the null as the square root of the sum
% of the standard error of the mean of the two samples
std_CRd     = sqrt(sem_W^2 + sem_M^2);

% the right boundary of the Critical Region at alpha=0.05 corresponds to two
% times the standard deviation (because of the 68-95-99.7 rule)
CR_right_bound  = 2*std_CRd;

disp(['The boundaries of the Critical Region are (',...
       num2str(-CR_right_bound,4),',',num2str(CR_right_bound,4),') cm']);
disp('Therefore we fail to reject the null hypothesis at the 0.05 significance level')

%% Let's calculate the p-value

p_value = 2*(1 - normcdf(difference/std_CRd));
disp(['the p-value is ',num2str(p_value,2),' (>0.05)'])

%% Now let's not assume that we can apply CLT, instead let's do a shuffle test

N_s = 10000; %number of permutations

all_diffs_means = zeros(1,N_s);

% we merge both samples
Y = [W M];

for k=1:N_s
    Y_s = randsample(Y,n_W+n_M,0); % permute the order of the heights
    W_s = Y_s(1:n_W);
    M_s = Y_s(n_W+1:end);          % keep the size of the two original samples

    mean_W_s = mean(W_s);
    mean_M_s = mean(M_s);

    all_diffs_means(k) = mean_M_s - mean_W_s;

end

% plot the histogram
figure()
histogram(all_diffs_means,-15:15)

% calculate the boundaries of the Critical Region
CR_interval = [quantile(all_diffs_means,.025), quantile(all_diffs_means,.975)];

disp(['The boundaries of the Critical Region are (',...
       num2str(CR_interval,2),') cm']);