Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- %% SO SQL query
- % WITH a AS (
- %
- % SELECT
- % u.id AS user_id
- % , COUNT(DISTINCT p.id) AS posts
- % FROM Users u
- % LEFT JOIN Posts p ON p.OwnerUserId = u.id AND p.PostTypeId = 2 AND (p.CreationDate>='2023-04-08' AND p.CreationDate<'2023-04-15')
- % GROUP BY u.id
- % )
- % , total AS (
- % SELECT COUNT(DISTINCT u.id) AS users
- % FROM Users u
- % )
- % SELECT
- % COUNT(DISTINCT user_id) AS users
- % , posts
- %
- %
- % FROM a
- % GROUP BY posts
- %
- %
- %
- % WITH a AS (
- %
- % SELECT
- % u.id AS user_id
- % , COUNT(DISTINCT p.id) AS posts
- % FROM Users u
- % LEFT JOIN Posts p ON p.OwnerUserId = u.id AND p.PostTypeId = 2 AND (p.CreationDate>='2022-11-12' AND p.CreationDate<'2022-11-19')
- % GROUP BY u.id
- % )
- % , total AS (
- % SELECT COUNT(DISTINCT u.id) AS users
- % FROM Users u
- % )
- % SELECT
- % COUNT(DISTINCT user_id) AS users
- % , posts
- %
- %
- % FROM a
- % GROUP BY posts
- %% Read and sort the CSVs; column "posts" is the number of posts, "users" is a count of users
- d1 = readtable('NovAnswerers.csv');
- d2 = readtable('AprilAnswerers.csv');
- d1 = sortrows(d1,'posts');
- d2 = sortrows(d2,'posts');
- d1 = d1(d1.posts>0,:);
- d2 = d2(d2.posts>0,:);
- %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
- sum(d1.posts.*d1.users)
- sum(d2.posts.*d2.users)
- %% Number of posters with >=3 answers
- sum(d1.users(d1.posts>=3))
- sum(d2.users(d2.posts>=3))
- %% Fraction of answers by >=3 posters
- sum(d1.posts(d1.posts>=3).*d1.users(d1.posts>=3)) / sum(d1.posts.*d1.users)
- sum(d2.posts(d2.posts>=3).*d2.users(d2.posts>=3)) / sum(d2.posts.*d2.users)
- % Simulation: Are these data consistent with just a general reduction in
- % answer rate?
- answerRatio = sum(d2.posts.*d2.users)/sum(d1.posts.*d1.users);
- nRep = 1000;
- totalAnswers = zeros(nRep,1);
- numTopAnswerers = zeros(nRep,1);
- nAnswersByTop = zeros(nRep,1);
- tic
- for n = 1:nRep
- simData = cell(height(d1),1);
- for ii=1:height(d1)
- % For each row in d1, we have N users who each have K posts. So, we
- % generate a N-by-K uniform random distribution, threshold at the
- % answerRatio, and sum across columns to get the new "posts per user"
- R = rand(d1.users(ii),d1.posts(ii));
- postsPerUser = sum(R<=answerRatio,2);
- simData{ii} = postsPerUser;
- end
- % We have a cell array here of N posts for each of the original counts, but
- % we really just want to bundle the users together and tabulate to get data
- % in the format we originally had.
- simData = cat(1,simData{:});
- t = tabulate(simData);
- users = t(:,2);
- posts = t(:,1);
- dsim = table(users,posts);
- %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
- totalAnswers(n) = sum(dsim.posts.*dsim.users);
- %% Number of posters with >=3 answers
- numTopAnswerers(n) = sum(dsim.users(dsim.posts>=3));
- %% Fraction of answers by >=3 posters
- nAnswersByTop(n) = sum(dsim.posts(dsim.posts>=3).*dsim.users(dsim.posts>=3)) / sum(dsim.posts.*dsim.users);
- end
- toc
- %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
- mean(totalAnswers)
- %% Number of posters with >=3 answers
- mean(numTopAnswerers)
- %% Fraction of answers by >=3 posters
- mean(nAnswersByTop)
- %% Plots; this is just based on a single simulation rather than an average across sims
- figure()
- subplot(1,2,1);
- hold on;
- plot([d2.posts(d2.posts<=20); 25],[d2.users(d2.posts<=20); sum(d2.users(d2.posts>20))],'ob')
- plot([dsim.posts(dsim.posts<=20); 25],[dsim.users(dsim.posts<=20); sum(dsim.users(dsim.posts>20))],'xr')
- set(gca,'XTick',[1 3 10 15 20 25],'XTickLabel',{'1','3','10','15','20','>20'})
- xlabel('# answers per user')
- ylabel('N users with X answers')
- legend({'April Actual','April Expected (simulation)'})
- title('All')
- subplot(1,2,2);
- hold on;
- plot([d2.posts(d2.posts<=20 & d2.posts>=3); 25],[d2.users(d2.posts<=20 & d2.posts>=3); sum(d2.users(d2.posts>20))],'ob')
- plot([dsim.posts(dsim.posts<=20 & dsim.posts>=3); 25],[dsim.users(dsim.posts<=20 & dsim.posts>=3); sum(dsim.users(dsim.posts>20))],'xr')
- set(gca,'XTick',[3 10 15 20 25],'XTickLabel',{'3','10','15','20','>20'})
- xlabel('# answers per user')
- ylabel('N users with X answers')
- legend({'April Actual','April Expected (simulation)'})
- title('>=3 only, different y-scale')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement