Advertisement
Guest User

SO weekly data comparison

a guest
Jun 8th, 2023
344
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
MatLab 4.34 KB | Source Code | 0 0
  1. %% SO SQL query
  2.  
  3. % WITH a AS (
  4. %
  5. % SELECT
  6. % u.id AS user_id
  7. % , COUNT(DISTINCT p.id) AS posts
  8. % FROM Users u
  9. % LEFT JOIN Posts p ON p.OwnerUserId = u.id AND p.PostTypeId = 2 AND (p.CreationDate>='2023-04-08' AND p.CreationDate<'2023-04-15')
  10. % GROUP BY u.id
  11. % )
  12. % , total AS (
  13. % SELECT COUNT(DISTINCT u.id) AS users
  14. % FROM Users u
  15. % )
  16. % SELECT
  17. % COUNT(DISTINCT user_id) AS users
  18. % , posts
  19. %
  20. %
  21. % FROM a
  22. % GROUP BY posts
  23. %
  24. %
  25. %
  26. % WITH a AS (
  27. %
  28. % SELECT
  29. % u.id AS user_id
  30. % , COUNT(DISTINCT p.id) AS posts
  31. % FROM Users u
  32. % LEFT JOIN Posts p ON p.OwnerUserId = u.id AND p.PostTypeId = 2 AND (p.CreationDate>='2022-11-12' AND p.CreationDate<'2022-11-19')
  33. % GROUP BY u.id
  34. % )
  35. % , total AS (
  36. % SELECT COUNT(DISTINCT u.id) AS users
  37. % FROM Users u
  38. % )
  39. % SELECT
  40. % COUNT(DISTINCT user_id) AS users
  41. % , posts
  42. %
  43. %
  44. % FROM a
  45. % GROUP BY posts
  46.  
  47. %% Read and sort the CSVs; column "posts" is the number of posts, "users" is a count of users
  48. d1 = readtable('NovAnswerers.csv');
  49. d2 = readtable('AprilAnswerers.csv');
  50.  
  51. d1 = sortrows(d1,'posts');
  52. d2 = sortrows(d2,'posts');
  53. d1 = d1(d1.posts>0,:);
  54. d2 = d2(d2.posts>0,:);
  55.  
  56.  
  57. %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
  58. sum(d1.posts.*d1.users)
  59. sum(d2.posts.*d2.users)
  60.  
  61. %% Number of posters with >=3 answers
  62. sum(d1.users(d1.posts>=3))
  63. sum(d2.users(d2.posts>=3))
  64.  
  65. %% Fraction of answers by >=3 posters
  66. sum(d1.posts(d1.posts>=3).*d1.users(d1.posts>=3)) / sum(d1.posts.*d1.users)
  67. sum(d2.posts(d2.posts>=3).*d2.users(d2.posts>=3)) / sum(d2.posts.*d2.users)
  68.  
  69. % Simulation: Are these data consistent with just a general reduction in
  70. % answer rate?
  71.  
  72. answerRatio = sum(d2.posts.*d2.users)/sum(d1.posts.*d1.users);
  73.  
  74.  
  75. nRep = 1000;
  76. totalAnswers = zeros(nRep,1);
  77. numTopAnswerers = zeros(nRep,1);
  78. nAnswersByTop = zeros(nRep,1);
  79. tic
  80. for n = 1:nRep
  81.     simData = cell(height(d1),1);
  82.     for ii=1:height(d1)
  83.         % For each row in d1, we have N users who each have K posts. So, we
  84.         % generate a N-by-K uniform random distribution, threshold at the
  85.         % answerRatio, and sum across columns to get the new "posts per user"
  86.         R = rand(d1.users(ii),d1.posts(ii));
  87.         postsPerUser = sum(R<=answerRatio,2);
  88.         simData{ii} = postsPerUser;
  89.     end
  90.  
  91.     % We have a cell array here of N posts for each of the original counts, but
  92.     % we really just want to bundle the users together and tabulate to get data
  93.     % in the format we originally had.
  94.     simData = cat(1,simData{:});
  95.     t = tabulate(simData);
  96.     users = t(:,2);
  97.     posts = t(:,1);
  98.     dsim = table(users,posts);
  99.  
  100.     %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
  101.     totalAnswers(n) = sum(dsim.posts.*dsim.users);
  102.  
  103.     %% Number of posters with >=3 answers
  104.     numTopAnswerers(n) = sum(dsim.users(dsim.posts>=3));
  105.  
  106.     %% Fraction of answers by >=3 posters
  107.     nAnswersByTop(n) = sum(dsim.posts(dsim.posts>=3).*dsim.users(dsim.posts>=3)) / sum(dsim.posts.*dsim.users);
  108. end
  109. toc
  110.  
  111. %% Number of answers: multiply posts by users and sum up. If 30 people post 10 answers each, that's 300.
  112. mean(totalAnswers)
  113.  
  114. %% Number of posters with >=3 answers
  115. mean(numTopAnswerers)
  116.  
  117. %% Fraction of answers by >=3 posters
  118. mean(nAnswersByTop)
  119.  
  120. %% Plots; this is just based on a single simulation rather than an average across sims
  121.  
  122. figure()
  123. subplot(1,2,1);
  124. hold on;
  125. plot([d2.posts(d2.posts<=20); 25],[d2.users(d2.posts<=20); sum(d2.users(d2.posts>20))],'ob')
  126.  
  127. plot([dsim.posts(dsim.posts<=20); 25],[dsim.users(dsim.posts<=20); sum(dsim.users(dsim.posts>20))],'xr')
  128.  
  129. set(gca,'XTick',[1 3 10 15 20 25],'XTickLabel',{'1','3','10','15','20','>20'})
  130. xlabel('# answers per user')
  131. ylabel('N users with X answers')
  132.  
  133. legend({'April Actual','April Expected (simulation)'})
  134. title('All')
  135.  
  136.  
  137. subplot(1,2,2);
  138. hold on;
  139. plot([d2.posts(d2.posts<=20 & d2.posts>=3); 25],[d2.users(d2.posts<=20 & d2.posts>=3); sum(d2.users(d2.posts>20))],'ob')
  140.  
  141. plot([dsim.posts(dsim.posts<=20 & dsim.posts>=3); 25],[dsim.users(dsim.posts<=20 & dsim.posts>=3); sum(dsim.users(dsim.posts>20))],'xr')
  142.  
  143. set(gca,'XTick',[3 10 15 20 25],'XTickLabel',{'3','10','15','20','>20'})
  144. xlabel('# answers per user')
  145. ylabel('N users with X answers')
  146.  
  147. legend({'April Actual','April Expected (simulation)'})
  148. title('>=3 only, different y-scale')
  149.  
  150.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement