SO weekly data comparison

%% SO SQL query

% WITH a AS (
%
% SELECT
% u.id AS user_id
% , COUNT(DISTINCT p.id) AS posts
% FROM Users u
% LEFT JOIN Posts p ON p.OwnerUserId = u.id AND p.PostTypeId = 2 AND (p.CreationDate>='2023-04-08' AND p.CreationDate<'2023-04-15')
% GROUP BY u.id
% )
% , total AS (
% SELECT COUNT(DISTINCT u.id) AS users
% FROM Users u
% )
% SELECT
% COUNT(DISTINCT user_id) AS users
% , posts
%
%
% FROM a
% GROUP BY posts
%
%
%
% WITH a AS (
%
% SELECT
% u.id AS user_id
% , COUNT(DISTINCT p.id) AS posts
% FROM Users u
% LEFT JOIN Posts p ON p.OwnerUserId = u.id AND p.PostTypeId = 2 AND (p.CreationDate>='2022-11-12' AND p.CreationDate<'2022-11-19')
% GROUP BY u.id
% )
% , total AS (
% SELECT COUNT(DISTINCT u.id) AS users
% FROM Users u
% )
% SELECT
% COUNT(DISTINCT user_id) AS users
% , posts
%
%
% FROM a
% GROUP BY posts

%% Read and sort the CSVs; column "posts" is the number of posts, "users" is a count of users
d1 = readtable('NovAnswerers.csv');
d2 = readtable('AprilAnswerers.csv');

d1 = sortrows(d1,'posts');
d2 = sortrows(d2,'posts');
d1 = d1(d1.posts>0,:);
d2 = d2(d2.posts>0,:);


%% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
sum(d1.posts.*d1.users)
sum(d2.posts.*d2.users)

%% Number of posters with >=3 answers
sum(d1.users(d1.posts>=3))
sum(d2.users(d2.posts>=3))

%% Fraction of answers by >=3 posters
sum(d1.posts(d1.posts>=3).*d1.users(d1.posts>=3)) / sum(d1.posts.*d1.users)
sum(d2.posts(d2.posts>=3).*d2.users(d2.posts>=3)) / sum(d2.posts.*d2.users)

% Simulation: Are these data consistent with just a general reduction in
% answer rate?

answerRatio = sum(d2.posts.*d2.users)/sum(d1.posts.*d1.users);


nRep = 1000;
totalAnswers = zeros(nRep,1);
numTopAnswerers = zeros(nRep,1);
nAnswersByTop = zeros(nRep,1);
tic
for n = 1:nRep
    simData = cell(height(d1),1);
    for ii=1:height(d1)
        % For each row in d1, we have N users who each have K posts. So, we
        % generate a N-by-K uniform random distribution, threshold at the
        % answerRatio, and sum across columns to get the new "posts per user"
        R = rand(d1.users(ii),d1.posts(ii));
        postsPerUser = sum(R<=answerRatio,2);
        simData{ii} = postsPerUser;
    end

    % We have a cell array here of N posts for each of the original counts, but
    % we really just want to bundle the users together and tabulate to get data
    % in the format we originally had.
    simData = cat(1,simData{:});
    t = tabulate(simData);
    users = t(:,2);
    posts = t(:,1);
    dsim = table(users,posts);

    %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
    totalAnswers(n) = sum(dsim.posts.*dsim.users);

    %% Number of posters with >=3 answers
    numTopAnswerers(n) = sum(dsim.users(dsim.posts>=3));

    %% Fraction of answers by >=3 posters
    nAnswersByTop(n) = sum(dsim.posts(dsim.posts>=3).*dsim.users(dsim.posts>=3)) / sum(dsim.posts.*dsim.users);
end
toc

%% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
mean(totalAnswers)

%% Number of posters with >=3 answers
mean(numTopAnswerers)

%% Fraction of answers by >=3 posters
mean(nAnswersByTop)

%% Plots; this is just based on a single simulation rather than an average across sims

figure()
subplot(1,2,1);
hold on;
plot([d2.posts(d2.posts<=20); 25],[d2.users(d2.posts<=20); sum(d2.users(d2.posts>20))],'ob')

plot([dsim.posts(dsim.posts<=20); 25],[dsim.users(dsim.posts<=20); sum(dsim.users(dsim.posts>20))],'xr')

set(gca,'XTick',[1 3 10 15 20 25],'XTickLabel',{'1','3','10','15','20','>20'})
xlabel('# answers per user')
ylabel('N users with X answers')

legend({'April Actual','April Expected (simulation)'})
title('All')


subplot(1,2,2);
hold on;
plot([d2.posts(d2.posts<=20 & d2.posts>=3); 25],[d2.users(d2.posts<=20 & d2.posts>=3); sum(d2.users(d2.posts>20))],'ob')

plot([dsim.posts(dsim.posts<=20 & dsim.posts>=3); 25],[dsim.users(dsim.posts<=20 & dsim.posts>=3); sum(dsim.users(dsim.posts>20))],'xr')

set(gca,'XTick',[3 10 15 20 25],'XTickLabel',{'3','10','15','20','>20'})
xlabel('# answers per user')
ylabel('N users with X answers')

legend({'April Actual','April Expected (simulation)'})
title('>=3 only, different y-scale')