Advertisement
westand

Various notes on WP:MED stats (updated!)

Jan 16th, 2014
436
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #############################################################################
  2. #
  3. # Andrew G. West - notes.txt - Notes about the WP:MED stats findings
  4. #
  5. #############################################################################
  6.  
  7. Top ten WP's by views:
  8.  
  9. en English
  10. es Spanish
  11. ru Russian
  12. ja Japanese
  13. de German
  14. fr French
  15. pt Portuguese
  16. it Italian
  17. pl Polish
  18. zh Chinese
  19.  
  20. #############################
  21.  
  22. mysql> SELECT SUBSTRING(PAGE,1,2),COUNT(*),SUM(year) FROM wp_med_monthly
  23. GROUP BY SUBSTRING(PAGE,1,2) ORDER BY COUNT(*) DESC;
  24.  
  25. +---------------------+----------+------------+
  26. | SUBSTRING(PAGE,1,2) | COUNT(*) | SUM(year) |
  27. +---------------------+----------+------------+
  28. | en | 29072 | 2277403987 |
  29. | de | 7761 | 348355902 |
  30. | fr | 6390 | 218532955 |
  31. | es | 6367 | 658556909 |
  32. | pl | 5999 | 147488961 |
  33. | it | 5677 | 147115245 |
  34. | pt | 5269 | 212465002 |
  35. | ru | 4865 | 150162878 |
  36. | nl | 4391 | 75543607 |
  37. | ja | 4303 | 253740601 |
  38. | ar | 4220 | 33228397 |
  39. | sv | 3661 | 30557682 |
  40. | zh | 3426 | 36225111 |
  41. | ca | 2992 | 4764627 |
  42. | fa | 2843 | 17695775 |
  43. | fi | 2825 | 22876930 |
  44. | he | 2389 | 16288412 |
  45. | cs | 2360 | 21955957 |
  46. | uk | 2291 | 11050301 |
  47. | no | 2186 | 12113834 |
  48. | tr | 1822 | 33904861 |
  49. | sr | 1775 | 2665627 |
  50. | ko | 1692 | 6543526 |
  51. | si | 1600 | 4251231 |
  52. | hu | 1528 | 12119560 |
  53. | da | 1433 | 6702468 |
  54. | id | 1431 | 22732356 |
  55. | hr | 1396 | 8679378 |
  56. | th | 1278 | 11120603 |
  57. | bg | 1242 | 7098983 |
  58. | ro | 1225 | 7805024 |
  59. | sl | 1191 | 3063597 |
  60. | gl | 1173 | 614948 |
  61. | lt | 1005 | 4123914 |
  62. | sk | 1000 | 4279143 | [snip!]
  63. 203 rows in set (0.17 sec)
  64.  
  65. #############################
  66.  
  67. mysql> SELECT SUM(year) FROM wp_med_monthly;
  68.  
  69. +------------+
  70. | SUM(year) |
  71. +------------+
  72. | 4,880,133,770 |
  73. +------------+
  74. 1 row in set (0.11 sec)
  75.  
  76. #############################
  77.  
  78. mysql> SELECT PAGE,COUNT(*)+1 FROM wp_med_map m, wp_med_monthly p
  79. WHERE m.en_id=p.id GROUP BY en_id ORDER BY COUNT(*) DESC LIMIT 100;
  80.  
  81. ADD 1 TO ACCOUNT FOR ENGLISH ENTRY NOT IN MAP TABLE!!!
  82.  
  83. +------------------------------------------------------+------------+
  84. | PAGE | COUNT(*)+1 |
  85. +------------------------------------------------------+------------+
  86. | en Leonardo_da_Vinci | 180 |
  87. | en Human | 165 |
  88. | en Human_biology | 165 |
  89. | en Medicine | 148 |
  90. | en Medical_practice | 148 |
  91. | en Blood | 137 |
  92. | en Fungus | 134 |
  93. | en HIV/AIDS | 128 |
  94. | en Brain_cell | 127 |
  95. | en Brain_function | 127 |
  96. | en Bacteria | 125 |
  97. | en Metastatic_tuberculous_ulcer | 123 |
  98. | en Tuberculosis | 123 |
  99. | en Tubercular_scar | 123 |
  100. | en Louis_Pasteur | 122 |
  101. | en Cancer_patient | 119 |
  102. | en Cancer | 119 |
  103. | en Anatomy | 117 |
  104. | en Mosquito_bite | 115 |
  105. | en World_Health_Organization | 115 |
  106. | en Mosquito | 115 |
  107. | en Diabetes_mellitus | 113 |
  108. | en Disorder_(medicine) | 113 |
  109. | en Disease | 113 |
  110. | en Illness | 113 |
  111. | en Head | 111 |
  112. | en Cerebral_malaria | 111 |
  113. | en Virus | 111 |
  114. | en Malaria | 111 |
  115. | en Bromine | 110 |
  116. | en Death | 110 |
  117. | en Health | 109 |
  118. | en Cholera | 105 |
  119. | en Neck | 103 |
  120. | en Influenza | 103 |
  121. | en Full_term | 101 |
  122. | en Pregnancy | 101 |
  123. | en Biochemistry | 101 |
  124. | en Avicenna | 101 |
  125. | en Angelina_Jolie_cancer_treatment | 98 |
  126. | en Abortion | 97 |
  127. | en Sexual_intercourse | 96 |
  128. | en Hippocrates | 96 |
  129. | en Hospital | 95 |
  130. | en International_Red_Cross_and_Red_Crescent_Movement | 94 |
  131. | en X-Ray | 92 |
  132. | en Genetics | 92 |
  133. | en X-ray | 92 |
  134. | en Suicide | 91 |
  135. | en Alzheimer's_disease | 90 |
  136. | en Pneumonia | 90 |
  137. | en Microbiology | 89 |
  138. | en Lip | 88 |
  139. | en Hormone | 88 |
  140. | en Sleep | 88 |
  141. | en Physiology | 88 |
  142. | en Obesity | 87 |
  143. | en Smallpox | 87 |
  144. | en Variola_major | 87 |
  145. | en Pharmaceutical_drug | 86 |
  146. | en Condom | 86 |
  147. [snip]!
  148.  
  149. #############################
  150.  
  151. mysql> CREATE TEMPORARY TABLE IF NOT EXISTS wp_med_temp AS
  152. (SELECT PAGE,COUNT(*)+1 as A FROM wp_med_map m, wp_med_monthly p
  153. WHERE m.en_id=p.id GROUP BY en_id ORDER BY COUNT(*));
  154.  
  155. mysql> SELECT A,COUNT(*) FROM wp_med_temp GROUP BY A ORDER BY A DESC;
  156. +-----+----------+
  157. | A | COUNT(*) |
  158. +-----+----------+
  159. | 180 | 1 |
  160. | 165 | 2 |
  161. | 148 | 2 |
  162. | 137 | 1 |
  163. | 134 | 1 |
  164. | 128 | 1 |
  165. | 127 | 2 |
  166. | 125 | 1 |
  167. | 123 | 3 |
  168. | 122 | 1 |
  169. | 119 | 2 |
  170. | 117 | 1 |
  171. | 115 | 3 |
  172. | 113 | 4 |
  173. | 111 | 4 |
  174. | 110 | 2 |
  175. | 109 | 1 |
  176. | 105 | 1 |
  177. | 103 | 2 |
  178. | 101 | 4 |
  179. | 98 | 1 |
  180. | 97 | 1 |
  181. | 96 | 2 |
  182. | 95 | 1 |
  183. | 94 | 1 |
  184. | 92 | 3 |
  185. | 91 | 1 |
  186. | 90 | 2 |
  187. | 89 | 1 |
  188. | 88 | 4 |
  189. | 87 | 3 |
  190. | 86 | 2 |
  191. | 85 | 3 |
  192. | 84 | 6 |
  193. | 83 | 3 |
  194. | 82 | 8 |
  195. | 81 | 5 |
  196. | 80 | 5 |
  197. | 79 | 2 |
  198. | 78 | 5 |
  199. | 77 | 7 |
  200. | 76 | 6 |
  201. | 75 | 6 |
  202. | 74 | 6 |
  203. | 73 | 3 |
  204. | 72 | 6 |
  205. | 71 | 9 |
  206. | 70 | 5 |
  207. | 69 | 5 |
  208. | 68 | 11 |
  209. | 67 | 6 |
  210. | 66 | 10 |
  211. | 65 | 9 |
  212. | 64 | 11 |
  213. | 63 | 13 |
  214. | 62 | 11 |
  215. | 61 | 13 |
  216. | 60 | 4 |
  217. | 59 | 11 |
  218. | 58 | 15 |
  219. | 57 | 17 |
  220. | 56 | 16 |
  221. | 55 | 8 |
  222. | 54 | 18 |
  223. | 53 | 22 |
  224. | 52 | 17 |
  225. | 51 | 22 |
  226. | 50 | 23 |
  227. | 49 | 15 |
  228. | 48 | 20 |
  229. | 47 | 27 |
  230. | 46 | 35 |
  231. | 45 | 32 |
  232. | 44 | 36 |
  233. | 43 | 36 |
  234. | 42 | 25 |
  235. | 41 | 24 |
  236. | 40 | 33 |
  237. | 39 | 25 |
  238. | 38 | 38 |
  239. | 37 | 28 |
  240. | 36 | 59 |
  241. | 35 | 35 |
  242. | 34 | 51 |
  243. | 33 | 78 |
  244. | 32 | 44 |
  245. | 31 | 65 |
  246. | 30 | 46 |
  247. | 29 | 69 |
  248. | 28 | 76 |
  249. | 27 | 79 |
  250. | 26 | 96 |
  251. | 25 | 90 |
  252. | 24 | 77 |
  253. | 23 | 126 |
  254. | 22 | 132 |
  255. | 21 | 166 |
  256. | 20 | 164 |
  257. | 19 | 230 |
  258. | 18 | 186 |
  259. | 17 | 209 |
  260. | 16 | 251 |
  261. | 15 | 261 |
  262. | 14 | 341 |
  263. | 13 | 292 |
  264. | 12 | 346 |
  265. | 11 | 402 |
  266. | 10 | 429 |
  267. | 9 | 505 |
  268. | 8 | 553 |
  269. | 7 | 677 |
  270. | 6 | 872 |
  271. | 5 | 1113 |
  272. | 4 | 1529 |
  273. | 3 | 2361 |
  274. | 2 | 4254 |
  275. +-----+----------+
  276. 116 rows in set (0.01 sec)
  277.  
  278.  
  279. REMEMBER THAT PAGES THAT EXIST ONLY IN ENGLISH WILL NOT APPEAR IN THIS LIST!!!
  280.  
  281. #############################
  282.  
  283. Corresponding to 1/1/2013
  284. ID=1007
  285. UNIX_DAY=15706 -- 16070
  286. Need to convert this to hours = 376944 -- 385703
  287.  
  288. And this is the starting point for summing 2013 views BY LANG/PROJECT
  289.  
  290. mysql> CREATE TABLE wp_med_projects (PROJECT varchar(127), VIEWS bigint unsigned not null);
  291.  
  292. mysql> INSERT INTO wp_med_projects SELECT PROJECT,SUM(HITS) FROM projectcount
  293. WHERE UNIX_HOUR>=376944 AND UNIX_HOUR<=385703 AND PROJECT NOT LIKE "%.%" GROUP BY PROJECT;
  294.  
  295. 581 rows in this table, and a primary key is problematic
  296.  
  297. #############################
  298.  
  299. mysql> SELECT * FROM wp_med_projects ORDER BY VIEWS DESC LIMIT 20;
  300. mysql> SELECT * FROM wp_med_projects ORDER BY VIEWS DESC LIMIT 20;
  301. +---------+-------------+
  302. | PROJECT | VIEWS |
  303. +---------+-------------+
  304. | en | 91252704491 |
  305. | es | 14806944198 |
  306. | ja | 12535042404 |
  307. | ru | 12072227602 |
  308. | de | 11067376677 |
  309. | fr | 8305441200 |
  310. | it | 5738581496 |
  311. | pl | 5691502347 |
  312. | pt | 5266100175 |
  313. | zh | 3775941714 |
  314. | nl | 2198783176 |
  315. | tr | 1646386888 |
  316. | ar | 1287936214 |
  317. | sv | 1242519595 |
  318. | cs | 922966523 |
  319. | id | 853940379 |
  320. | ko | 807918557 |
  321. | uk | 789594746 |
  322. | fi | 720451053 |
  323. | th | 713506494 |
  324. +---------+-------------+
  325. 20 rows in set (0.00 sec)
  326.  
  327.  
  328. #############################
  329.  
  330. mysql> SELECT SUBSTRING(m.PAGE,1,2),SUM(m.year),(SELECT MAX(VIEWS) FROM
  331. wp_med_projects WHERE PROJECT=SUBSTRING(m.PAGE,1,2)) as PROJ,
  332. 100*(SUM(m.year)/(SELECT MAX(VIEWS) FROM wp_med_projects WHERE
  333. PROJECT=SUBSTRING(m.PAGE,1,2))) as PERCENT FROM wp_med_monthly m
  334. GROUP BY SUBSTRING(m.PAGE,1,2) ORDER BY SUM(m.year) DESC;
  335.  
  336. +-----------------------+-------------+-------------+-------------+
  337. | SUBSTRING(m.PAGE,1,2) | SUM(m.year) | PROJ | PERCENT |
  338. +-----------------------+-------------+-------------+-------------+
  339. | en | 2277403987 | 91252704491 | 2.4957 |
  340. | es | 658556909 | 14806944198 | 4.4476 |
  341. | de | 348355902 | 11067376677 | 3.1476 |
  342. | ja | 253740601 | 12535042404 | 2.0243 |
  343. | fr | 218532955 | 8305441200 | 2.6312 |
  344. | pt | 212465002 | 5266100175 | 4.0346 |
  345. | ru | 150162878 | 12072227602 | 1.2439 |
  346. | pl | 147488961 | 5691502347 | 2.5914 |
  347. | it | 147115245 | 5738581496 | 2.5636 |
  348. | nl | 75543607 | 2198783176 | 3.4357 |
  349. | zh | 36225111 | 3775941714 | 0.9594 |
  350. | tr | 33904861 | 1646386888 | 2.0593 |
  351. | ar | 33228397 | 1287936214 | 2.5800 |
  352. | sv | 30557682 | 1242519595 | 2.4593 |
  353. | fi | 22876930 | 720451053 | 3.1754 |
  354. | id | 22732356 | 853940379 | 2.6621 |
  355. | cs | 21955957 | 922966523 | 2.3788 |
  356. | fa | 17695775 | 688385541 | 2.5706 |
  357. | he | 16288412 | 584759276 | 2.7855 |
  358. | vi | 12449359 | 662259161 | 1.8798 |
  359. | hu | 12119560 | 634729654 | 1.9094 |
  360. | no | 12113834 | 596318064 | 2.0314 |
  361. | th | 11120603 | 713506494 | 1.5586 |
  362. | uk | 11050301 | 789594746 | 1.3995 |
  363. | hr | 8679378 | 277338602 | 3.1295 |
  364. | el | 7860165 | 322117346 | 2.4402 |
  365. | ro | 7805024 | 394144067 | 1.9802 |
  366. | bg | 7098983 | 345599276 | 2.0541 |
  367. | da | 6702468 | 399676672 | 1.6770 |
  368. | ko | 6543526 | 807918557 | 0.8099 |
  369. | ca | 4764627 | 298249613 | 1.5975 |
  370. | sk | 4279143 | 230244607 | 1.8585 [snip!]
  371.  
  372.  
  373. #############################
  374.  
  375. SELECT PROJECT,VARIANCE(HITS) FROM projectcount WHERE UNIX_HOUR>=376944
  376. AND UNIX_HOUR<=385703 AND PROJECT NOT LIKE "%.%" GROUP BY PROJECT;
  377.  
  378. SELECT PROJECT,VARIANCE(HITS) FROM projectcount WHERE UNIX_HOUR>=376944
  379. AND UNIX_HOUR<=385703 AND (PROJECT LIKE "en" OR PROJECT LIKE "it"
  380. OR PROJECT LIKE "zh" OR PROJECT LIKE "pt" OR PROJECT LIKE "de"
  381. OR PROJECT LIKE "fr" OR PROJECT LIKE "pl" OR PROJECT LIKE "ru"
  382. OR PROJECT LIKE "ja" OR PROJECT LIKE "es") GROUP BY PROJECT;
  383.  
  384. +---------+--------------------+
  385. | PROJECT | VARIANCE(HITS) |
  386. +---------+--------------------+
  387. | de | 398996828928.1165 | = 3.9 E11
  388. | en | 4133472736509.7593 | = 4.1 E12
  389. | es | 750902541698.0205 |
  390. | fr | 186084980422.9977 |
  391. | it | 113402184155.7467 |
  392. | ja | 347401633589.6532 |
  393. | pl | 52918051447.4765 |
  394. | pt | 91186214042.1714 |
  395. | ru | 435896954210.2971 |
  396. | zh | 22312185282.3384 |
  397. +---------+--------------------+
  398. 10 rows in set (12.58 sec)
  399.  
  400.  
  401. Top "en" variance articles per [top_variance.java]:
  402.  
  403. 1.1687814713924304E11 en Ricin
  404. 1.0235129531805556E11 en Tinnitus
  405. 5.2550090006354164E10 en Rorschach_test
  406. 4.7889255083020836E10 en Latent_inhibition
  407. 4.068264651888889E10 en Scientific_consensus
  408. 3.6692258284354164E10 en Veganism
  409. 3.6615616211076385E10 en Vegetarianism
  410. 3.3384832597833332E10 en Lacto_vegetarianism
  411. 2.8962499269388885E10 en Mastectomy
  412. 2.4725881447409725E10 en Patient_Protection_and_Affordable_Care_Act
  413. 2.2501433001805557E10 en Norovirus
  414. 2.2254987207020832E10 en Onychophagia
  415. 2.050006175025E10 en Sarin
  416. 1.7162366044E10 en Conjugate_gaze_palsy
  417. 1.6797197597333334E10 en Diaper
  418. 1.5511770852576387E10 en Leonardo_da_Vinci
  419. 1.2866544604076387E10 en Asperger_syndrome
  420. 1.1853506504E10 en Pseudofolliculitis_barbae
  421. 1.1301288210555557E10 en Bronchitis
  422. 9.5828020621875E9 en Desomorphine
  423. 8.810964324520834E9 en Blind_spot_(vision)
  424. 8.644234368354166E9 en Psychosis
  425. 8.600064082805555E9 en Influenza
  426. 8.599932074020834E9 en Bipolar_disorder
  427. 8.566960627888889E9 en Down_syndrome
  428. 8.061887144743056E9 en Blue_field_entoptic_phenomenon
  429.  
  430.  
  431. #############################
  432.  
  433. Other questions to ask of the data:
  434. * Articles with greatest month to month variance
  435. * Articles with greatest inter-country variance
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement