Advertisement
westand

Various WP:MED stats notes

Jan 13th, 2014
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.16 KB | None | 0 0
  1. #############################################################################
  2. #
  3. # Andrew G. West - notes.txt - Notes about the WP:MED stats findings
  4. #
  5. #############################################################################
  6.  
  7. Top ten WP's by views:
  8.  
  9. en English
  10. es Spanish
  11. ru Russian
  12. ja Japanese
  13. de German
  14. fr French
  15. pt Portuguese
  16. it Italian
  17. pl Polish
  18. zh Chinese
  19.  
  20. #############################
  21.  
  22. mysql> SELECT SUBSTRING(PAGE,1,2),COUNT(*),SUM(year) FROM wp_med_monthly
  23. GROUP BY SUBSTRING(PAGE,1,2) ORDER BY COUNT(*) DESC;
  24.  
  25. +---------------------+----------+------------+
  26. | SUBSTRING(PAGE,1,2) | COUNT(*) | SUM(year) |
  27. +---------------------+----------+------------+
  28. | en | 29072 | 2277403987 |
  29. | de | 7761 | 348355902 |
  30. | fr | 6390 | 218532955 |
  31. | es | 6367 | 658556909 |
  32. | pl | 5999 | 147488961 |
  33. | it | 5677 | 147115245 |
  34. | pt | 5269 | 212465002 |
  35. | ru | 4865 | 150162878 |
  36. | nl | 4391 | 75543607 |
  37. | ja | 4303 | 253740601 |
  38. | ar | 4220 | 33228397 |
  39. | sv | 3661 | 30557682 |
  40. | zh | 3426 | 36225111 |
  41. | ca | 2992 | 4764627 |
  42. | fa | 2843 | 17695775 |
  43. | fi | 2825 | 22876930 |
  44. | he | 2389 | 16288412 |
  45. | cs | 2360 | 21955957 |
  46. | uk | 2291 | 11050301 |
  47. | no | 2186 | 12113834 |
  48. | tr | 1822 | 33904861 |
  49. | sr | 1775 | 2665627 |
  50. | ko | 1692 | 6543526 |
  51. | si | 1600 | 4251231 |
  52. | hu | 1528 | 12119560 |
  53. | da | 1433 | 6702468 |
  54. | id | 1431 | 22732356 |
  55. | hr | 1396 | 8679378 |
  56. | th | 1278 | 11120603 |
  57. | bg | 1242 | 7098983 |
  58. | ro | 1225 | 7805024 |
  59. | sl | 1191 | 3063597 |
  60. | gl | 1173 | 614948 |
  61. | lt | 1005 | 4123914 |
  62. | sk | 1000 | 4279143 | [snip!]
  63. 203 rows in set (0.17 sec)
  64.  
  65. #############################
  66.  
  67. mysql> SELECT SUM(year) FROM wp_med_monthly;
  68.  
  69. +------------+
  70. | SUM(year) |
  71. +------------+
  72. | 4,880,133,770 |
  73. +------------+
  74. 1 row in set (0.11 sec)
  75.  
  76. #############################
  77.  
  78. mysql> SELECT PAGE,COUNT(*) FROM wp_med_map m, wp_med_monthly p
  79. WHERE m.en_id=p.id GROUP BY en_id ORDER BY COUNT(*) DESC LIMIT 100;
  80.  
  81. +------------------------------------------------------+----------+
  82. | PAGE | COUNT(*) |
  83. +------------------------------------------------------+----------+
  84. | en Leonardo_da_Vinci | 179 |
  85. | en Human | 164 |
  86. | en Human_biology | 164 |
  87. | en Medicine | 147 |
  88. | en Medical_practice | 147 |
  89. | en Blood | 136 |
  90. | en Fungus | 133 |
  91. | en HIV/AIDS | 127 |
  92. | en Brain_cell | 126 |
  93. | en Brain_function | 126 |
  94. | en Bacteria | 124 |
  95. | en Metastatic_tuberculous_ulcer | 122 |
  96. | en Tuberculosis | 122 |
  97. | en Tubercular_scar | 122 |
  98. | en Louis_Pasteur | 121 |
  99. | en Cancer_patient | 118 |
  100. | en Cancer | 118 |
  101. | en Anatomy | 116 |
  102. | en Mosquito_bite | 114 |
  103. | en World_Health_Organization | 114 |
  104. | en Mosquito | 114 |
  105. | en Diabetes_mellitus | 112 |
  106. | en Disorder_(medicine) | 112 |
  107. | en Disease | 112 |
  108. | en Illness | 112 |
  109. | en Head | 110 |
  110. | en Cerebral_malaria | 110 |
  111. | en Virus | 110 |
  112. | en Malaria | 110 |
  113. | en Bromine | 109 |
  114. | en Death | 109 |
  115. | en Health | 108 |
  116. | en Cholera | 104 |
  117. | en Neck | 102 |
  118. | en Influenza | 102 |
  119. | en Full_term | 100 |
  120. | en Pregnancy | 100 |
  121. | en Biochemistry | 100 |
  122. | en Avicenna | 100 |
  123. | en Angelina_Jolie_cancer_treatment | 97 |
  124. | en Abortion | 96 |
  125. | en Sexual_intercourse | 95 |
  126. | en Hippocrates | 95 |
  127. | en Hospital | 94 |
  128. | en International_Red_Cross_and_Red_Crescent_Movement | 93 |
  129. | en X-Ray | 91 |
  130. | en Genetics | 91 |
  131. | en X-ray | 91 |
  132. | en Suicide | 90 |
  133. | en Alzheimer's_disease | 89 |
  134. | en Pneumonia | 89 |
  135. | en Microbiology | 88 |
  136. | en Lip | 87 |
  137. | en Hormone | 87 |
  138. | en Sleep | 87 |
  139. | en Physiology | 87 |
  140. | en Obesity | 86 |
  141. | en Smallpox | 86 |
  142. | en Variola_major | 86 |
  143. | en Pharmaceutical_drug | 85 |
  144. | en Condom | 85 |
  145. [snip]!
  146.  
  147. #############################
  148.  
  149. mysql> CREATE TEMPORARY TABLE IF NOT EXISTS wp_med_temp AS
  150. (SELECT PAGE,COUNT(*) as A FROM wp_med_map m, wp_med_monthly p
  151. WHERE m.en_id=p.id GROUP BY en_id ORDER BY COUNT(*));
  152.  
  153. mysql> SELECT A,COUNT(*) FROM wp_med_temp GROUP BY A ORDER BY A DESC;
  154. +-----+----------+
  155. | A | COUNT(*) |
  156. +-----+----------+
  157. | 179 | 1 |
  158. | 164 | 2 |
  159. | 147 | 2 |
  160. | 136 | 1 |
  161. | 133 | 1 |
  162. | 127 | 1 |
  163. | 126 | 2 |
  164. | 124 | 1 |
  165. | 122 | 3 |
  166. | 121 | 1 |
  167. | 118 | 2 |
  168. | 116 | 1 |
  169. | 114 | 3 |
  170. | 112 | 4 |
  171. | 110 | 4 |
  172. | 109 | 2 |
  173. | 108 | 1 |
  174. | 104 | 1 |
  175. | 102 | 2 |
  176. | 100 | 4 |
  177. | 97 | 1 |
  178. | 96 | 1 |
  179. | 95 | 2 |
  180. | 94 | 1 |
  181. | 93 | 1 |
  182. | 91 | 3 |
  183. | 90 | 1 |
  184. | 89 | 2 |
  185. | 88 | 1 |
  186. | 87 | 4 |
  187. | 86 | 3 |
  188. | 85 | 2 |
  189. | 84 | 3 |
  190. | 83 | 6 |
  191. | 82 | 3 |
  192. | 81 | 8 |
  193. | 80 | 5 |
  194. | 79 | 5 |
  195. | 78 | 2 |
  196. | 77 | 5 |
  197. | 76 | 7 |
  198. | 75 | 6 |
  199. | 74 | 6 |
  200. | 73 | 6 |
  201. | 72 | 3 |
  202. | 71 | 6 |
  203. | 70 | 9 |
  204. | 69 | 5 |
  205. | 68 | 5 |
  206. | 67 | 11 |
  207. | 66 | 6 |
  208. | 65 | 10 |
  209. | 64 | 9 |
  210. | 63 | 11 |
  211. | 62 | 13 |
  212. | 61 | 11 |
  213. | 60 | 13 |
  214. | 59 | 4 |
  215. | 58 | 11 |
  216. | 57 | 15 |
  217. | 56 | 17 |
  218. | 55 | 16 |
  219. | 54 | 8 |
  220. | 53 | 18 |
  221. | 52 | 22 |
  222. | 51 | 17 |
  223. | 50 | 22 |
  224. | 49 | 23 |
  225. | 48 | 15 |
  226. | 47 | 20 |
  227. | 46 | 27 |
  228. | 45 | 35 |
  229. | 44 | 32 |
  230. | 43 | 36 |
  231. | 42 | 36 |
  232. | 41 | 25 |
  233. | 40 | 24 |
  234. | 39 | 33 |
  235. | 38 | 25 |
  236. | 37 | 38 |
  237. | 36 | 28 |
  238. | 35 | 59 |
  239. | 34 | 35 |
  240. | 33 | 51 |
  241. | 32 | 78 |
  242. | 31 | 44 |
  243. | 30 | 65 |
  244. | 29 | 46 |
  245. | 28 | 69 |
  246. | 27 | 76 |
  247. | 26 | 79 |
  248. | 25 | 96 |
  249. | 24 | 90 |
  250. | 23 | 77 |
  251. | 22 | 126 |
  252. | 21 | 132 |
  253. | 20 | 166 |
  254. | 19 | 164 |
  255. | 18 | 230 |
  256. | 17 | 186 |
  257. | 16 | 209 |
  258. | 15 | 251 |
  259. | 14 | 261 |
  260. | 13 | 341 |
  261. | 12 | 292 |
  262. | 11 | 346 |
  263. | 10 | 402 |
  264. | 9 | 429 |
  265. | 8 | 505 |
  266. | 7 | 553 |
  267. | 6 | 677 |
  268. | 5 | 872 |
  269. | 4 | 1113 |
  270. | 3 | 1529 |
  271. | 2 | 2361 |
  272. | 1 | 4254 |
  273. +-----+----------+
  274. 116 rows in set (0.01 sec)
  275.  
  276. REMEMBER THAT PAGES THAT EXIST ONLY IN ENGLISH WILL NOT APPEAR IN THIS LIST!!!
  277.  
  278. #############################
  279.  
  280. Corresponding to 1/1/2013
  281. ID=1007
  282. UNIX_DAY=15706 -- 16070
  283. Need to convert this to hours = 376944 -- 385703
  284.  
  285. And this is the starting point for summing 2013 views BY LANG/PROJECT
  286.  
  287. mysql> CREATE TABLE wp_med_projects (PROJECT varchar(127), VIEWS bigint unsigned not null);
  288.  
  289. mysql> INSERT INTO wp_med_projects SELECT PROJECT,SUM(HITS) FROM projectcount
  290. WHERE UNIX_HOUR>=376944 AND UNIX_HOUR<=385703 AND PROJECT NOT LIKE "%.%" GROUP BY PROJECT;
  291.  
  292. 581 rows in this table, and a primary key is problematic
  293.  
  294. #############################
  295.  
  296. mysql> SELECT * FROM wp_med_projects ORDER BY VIEWS DESC LIMIT 20;
  297. mysql> SELECT * FROM wp_med_projects ORDER BY VIEWS DESC LIMIT 20;
  298. +---------+-------------+
  299. | PROJECT | VIEWS |
  300. +---------+-------------+
  301. | en | 91252704491 |
  302. | es | 14806944198 |
  303. | ja | 12535042404 |
  304. | ru | 12072227602 |
  305. | de | 11067376677 |
  306. | fr | 8305441200 |
  307. | it | 5738581496 |
  308. | pl | 5691502347 |
  309. | pt | 5266100175 |
  310. | zh | 3775941714 |
  311. | nl | 2198783176 |
  312. | tr | 1646386888 |
  313. | ar | 1287936214 |
  314. | sv | 1242519595 |
  315. | cs | 922966523 |
  316. | id | 853940379 |
  317. | ko | 807918557 |
  318. | uk | 789594746 |
  319. | fi | 720451053 |
  320. | th | 713506494 |
  321. +---------+-------------+
  322. 20 rows in set (0.00 sec)
  323.  
  324.  
  325. #############################
  326.  
  327. mysql> SELECT SUBSTRING(m.PAGE,1,2),SUM(m.year),(SELECT MAX(VIEWS) FROM
  328. wp_med_projects WHERE PROJECT=SUBSTRING(m.PAGE,1,2)) as PROJ,
  329. 100*(SUM(m.year)/(SELECT MAX(VIEWS) FROM wp_med_projects WHERE
  330. PROJECT=SUBSTRING(m.PAGE,1,2))) as PERCENT FROM wp_med_monthly m
  331. GROUP BY SUBSTRING(m.PAGE,1,2) ORDER BY SUM(m.year) DESC;
  332.  
  333. +-----------------------+-------------+-------------+-------------+
  334. | SUBSTRING(m.PAGE,1,2) | SUM(m.year) | PROJ | PERCENT |
  335. +-----------------------+-------------+-------------+-------------+
  336. | en | 2277403987 | 91252704491 | 2.4957 |
  337. | es | 658556909 | 14806944198 | 4.4476 |
  338. | de | 348355902 | 11067376677 | 3.1476 |
  339. | ja | 253740601 | 12535042404 | 2.0243 |
  340. | fr | 218532955 | 8305441200 | 2.6312 |
  341. | pt | 212465002 | 5266100175 | 4.0346 |
  342. | ru | 150162878 | 12072227602 | 1.2439 |
  343. | pl | 147488961 | 5691502347 | 2.5914 |
  344. | it | 147115245 | 5738581496 | 2.5636 |
  345. | nl | 75543607 | 2198783176 | 3.4357 |
  346. | zh | 36225111 | 3775941714 | 0.9594 |
  347. | tr | 33904861 | 1646386888 | 2.0593 |
  348. | ar | 33228397 | 1287936214 | 2.5800 |
  349. | sv | 30557682 | 1242519595 | 2.4593 |
  350. | fi | 22876930 | 720451053 | 3.1754 |
  351. | id | 22732356 | 853940379 | 2.6621 |
  352. | cs | 21955957 | 922966523 | 2.3788 |
  353. | fa | 17695775 | 688385541 | 2.5706 |
  354. | he | 16288412 | 584759276 | 2.7855 |
  355. | vi | 12449359 | 662259161 | 1.8798 |
  356. | hu | 12119560 | 634729654 | 1.9094 |
  357. | no | 12113834 | 596318064 | 2.0314 |
  358. | th | 11120603 | 713506494 | 1.5586 |
  359. | uk | 11050301 | 789594746 | 1.3995 |
  360. | hr | 8679378 | 277338602 | 3.1295 |
  361. | el | 7860165 | 322117346 | 2.4402 |
  362. | ro | 7805024 | 394144067 | 1.9802 |
  363. | bg | 7098983 | 345599276 | 2.0541 |
  364. | da | 6702468 | 399676672 | 1.6770 |
  365. | ko | 6543526 | 807918557 | 0.8099 |
  366. | ca | 4764627 | 298249613 | 1.5975 |
  367. | sk | 4279143 | 230244607 | 1.8585 [snip!]
  368.  
  369.  
  370. #############################
  371.  
  372. SELECT PROJECT,VARIANCE(HITS) FROM projectcount WHERE UNIX_HOUR>=376944
  373. AND UNIX_HOUR<=385703 AND PROJECT NOT LIKE "%.%" GROUP BY PROJECT;
  374.  
  375. SELECT PROJECT,VARIANCE(HITS) FROM projectcount WHERE UNIX_HOUR>=376944
  376. AND UNIX_HOUR<=385703 AND (PROJECT LIKE "en" OR PROJECT LIKE "it"
  377. OR PROJECT LIKE "zh" OR PROJECT LIKE "pt" OR PROJECT LIKE "de"
  378. OR PROJECT LIKE "fr" OR PROJECT LIKE "pl" OR PROJECT LIKE "ru"
  379. OR PROJECT LIKE "ja" OR PROJECT LIKE "es") GROUP BY PROJECT;
  380.  
  381. +---------+--------------------+
  382. | PROJECT | VARIANCE(HITS) |
  383. +---------+--------------------+
  384. | de | 398996828928.1165 | = 3.9 E11
  385. | en | 4133472736509.7593 | = 4.1 E12
  386. | es | 750902541698.0205 |
  387. | fr | 186084980422.9977 |
  388. | it | 113402184155.7467 |
  389. | ja | 347401633589.6532 |
  390. | pl | 52918051447.4765 |
  391. | pt | 91186214042.1714 |
  392. | ru | 435896954210.2971 |
  393. | zh | 22312185282.3384 |
  394. +---------+--------------------+
  395. 10 rows in set (12.58 sec)
  396.  
  397.  
  398. Top "en" variance articles per [top_variance.java]:
  399.  
  400. 1.1687814713924304E11 en Ricin
  401. 1.0235129531805556E11 en Tinnitus
  402. 5.2550090006354164E10 en Rorschach_test
  403. 4.7889255083020836E10 en Latent_inhibition
  404. 4.068264651888889E10 en Scientific_consensus
  405. 3.6692258284354164E10 en Veganism
  406. 3.6615616211076385E10 en Vegetarianism
  407. 3.3384832597833332E10 en Lacto_vegetarianism
  408. 2.8962499269388885E10 en Mastectomy
  409. 2.4725881447409725E10 en Patient_Protection_and_Affordable_Care_Act
  410. 2.2501433001805557E10 en Norovirus
  411. 2.2254987207020832E10 en Onychophagia
  412. 2.050006175025E10 en Sarin
  413. 1.7162366044E10 en Conjugate_gaze_palsy
  414. 1.6797197597333334E10 en Diaper
  415. 1.5511770852576387E10 en Leonardo_da_Vinci
  416. 1.2866544604076387E10 en Asperger_syndrome
  417. 1.1853506504E10 en Pseudofolliculitis_barbae
  418. 1.1301288210555557E10 en Bronchitis
  419. 9.5828020621875E9 en Desomorphine
  420. 8.810964324520834E9 en Blind_spot_(vision)
  421. 8.644234368354166E9 en Psychosis
  422. 8.600064082805555E9 en Influenza
  423. 8.599932074020834E9 en Bipolar_disorder
  424. 8.566960627888889E9 en Down_syndrome
  425. 8.061887144743056E9 en Blue_field_entoptic_phenomenon
  426.  
  427.  
  428. #############################
  429.  
  430. Other questions to ask of the data:
  431. * Articles with greatest month to month variance
  432. * Articles with greatest inter-country variance
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement