Advertisement
Guest User

Untitled

a guest
Aug 29th, 2018
188
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 94.49 KB | None | 0 0
  1. {
  2. "metadata": {
  3. "name": "",
  4. "signature": "sha256:962bc8e068d054a027f1195730bba2115cb820266aabc57ae123f73d5294650f"
  5. },
  6. "nbformat": 3,
  7. "nbformat_minor": 0,
  8. "worksheets": [
  9. {
  10. "cells": [
  11. {
  12. "cell_type": "markdown",
  13. "metadata": {},
  14. "source": [
  15. "## Data Cleanup Notebook\n",
  16. "\n",
  17. "Some thoughts before I get started with this. \n",
  18. " \n",
  19. " * Start by removing short term users. Recalculate for repeatability\n",
  20. " \n",
  21. " * Remove blank SIDs, instructors, and test courses (gatech,devcourse, and lightly used courses) \n",
  22. " \n",
  23. " * Remove nontraditional pages (those not begining with courselib or runestone \n",
  24. " \n",
  25. " * Remove unofficial div_ids\n",
  26. " \n",
  27. " * Remove outliers. Recalculate them for repeatability\n",
  28. " \n",
  29. " * Replace runestone with courselib\n",
  30. " \n",
  31. " * Replace mChoice with mchoice for consistency\n",
  32. " \n",
  33. " \n",
  34. "THINGS YOU NEED BEFORE RUNNING THIS NOTEBOOK:\n",
  35. "\n",
  36. "Tables:\n",
  37. " * useinfo (or something similar)\n",
  38. " * courses\n",
  39. " \n",
  40. "Python Scripts:\n",
  41. " * bookParse.py\n",
  42. " * getDivs.py\n"
  43. ]
  44. },
  45. {
  46. "cell_type": "code",
  47. "collapsed": false,
  48. "input": [
  49. "import getDivs\n",
  50. "import bookParse as bp\n",
  51. "from getInstructors import getInstructors\n",
  52. "import re\n",
  53. "import pandas as pd\n",
  54. "import psycopg2\n",
  55. "import socket\n",
  56. "import numpy as np\n",
  57. "import pandas.io.sql as psql\n",
  58. "from pandas import DataFrame\n",
  59. "from sqlalchemy import create_engine\n",
  60. "\n",
  61. "db_to_clean = 'bmiller'\n",
  62. "db_user = 'bmiller'\n",
  63. "db_pass = ''\n",
  64. "\n",
  65. "if db_pass:\n",
  66. " engine2 = create_engine('postgresql+psycopg2://%s:%s@localhost/%s' % (db_user, db_pass, db_to_clean))\n",
  67. " conn = psycopg2.connect(host=\"localhost\",database=db_to_clean,user=db_user,password=db_pass)\n",
  68. "else:\n",
  69. " engine2 = create_engine('postgresql+psycopg2://%s@localhost/%s' % (db_user, db_to_clean))\n",
  70. " conn = psycopg2.connect(host=\"localhost\",database=db_to_clean,user=db_user)\n",
  71. "\n",
  72. "connection2 = engine2.connect()\n",
  73. "\n",
  74. "\n",
  75. "%load_ext sql \n",
  76. "%sql postgresql://bmiller@localhost/bmiller"
  77. ],
  78. "language": "python",
  79. "metadata": {},
  80. "outputs": [
  81. {
  82. "output_type": "stream",
  83. "stream": "stdout",
  84. "text": [
  85. "The sql extension is already loaded. To reload it, use:\n",
  86. " %reload_ext sql\n"
  87. ]
  88. },
  89. {
  90. "metadata": {},
  91. "output_type": "pyout",
  92. "prompt_number": 47,
  93. "text": [
  94. "'Connected: bmiller@bmiller'"
  95. ]
  96. }
  97. ],
  98. "prompt_number": 47
  99. },
  100. {
  101. "cell_type": "heading",
  102. "level": 3,
  103. "metadata": {},
  104. "source": [
  105. "Remove non-traditional pages, blank sids, and instructors"
  106. ]
  107. },
  108. {
  109. "cell_type": "heading",
  110. "level": 6,
  111. "metadata": {},
  112. "source": [
  113. "Before running the following query, make sure to insert the most recent list of instructors."
  114. ]
  115. },
  116. {
  117. "cell_type": "code",
  118. "collapsed": false,
  119. "input": [
  120. "%%sql\n",
  121. "DELETE FROM useinfo\n",
  122. "WHERE (event = 'page' AND NOT (div_id ~ '^/(courselib|runestone).*')) --#non-traditional pages\n",
  123. "OR sid = '' --#blank sids\n",
  124. "OR sid IN ('DaveCousins', '01418116', 'sbcmharrison', 'mrcpittway', 'paolobianconi', 'OrganizeFISH', 'brekke', 'haugrudm', 'fnurl', 'Martin Highmore', 'gisc', 'jstephens', 'pguse', 'klaidley', 'Nobula', 'ivipul', 'dgyanek', 'Nicholas Seward', 'asahny', 'chorse', 'rschulz', 'nedoluzhko', 'sara', 'memedu', 'swbcteach', 'bula', 'pbeens', 'wmp', 'potterwalt', 'walkermsum', 'acsizmadia', 'the.krup', 'the_bob_student', 'the_bob', 'jgreenawalt', 'Mcintoshchs', 'gkondrak', 'a.reid', 'mckeownp', 'timbellnz', 'quintin', 'rahmani', 'tiverson', 'gccktaira', 'vtn2', 'bmiller', 'rosew', 'jbschafer', 'keen', 'sgilbert@occ.cccd.edu', 'as2653', 'RKumar', 'dcaswell', 'dzolzer', 'gvrinten', 'minsu', 'mzmacky2', 'ktaira', 'rkowaney', 'WarrenPettitt', 'tompkins', 'tompkinsJack', 'HullSmurf001', 'tagliarinig', 'CSC131tompkins', 'vellinga', 'pearcej', 'rambasnet', 'mdogy', 'cmurakami', 'ficekr', 'ficek', 'wilf.lendrum', 'shlomor', 'jazcap', 'dlovell', 'alter718', 'polarlemniscate', 'acbart', 'pmisterovich', 'CheeseTurtle', 'mr.octavio', 'mirita81', 'shyampurk', 'robertcduvall', 'ludaesch', 'rpugh', 'khelson1', 'emcpPYTHON', 'horacers', 'JohnDoorenbos', 'dunfeer', 'techplex.engineer', 'misa.tesi', 'kwhillans', 'dr_bwilson', 'r70smith', 'tand', 'bschormann', 'traveltcg', 'mridgway', 'hhtsimpson', 'bmoening', 'mhilland', 'pam.meier', 'tdragon@ithaca.edu', 'bokhariz', 'MrYantho', 'ine5201', 'avunque', 'rasika', 'dtgreen', 'camingus', 'hogue', 'tstrat', 'stormeb', 'lwchadd', 'xty.dsouza', 'elevel', 'ThaddeusAid', 'davidUrr', 'hitoshi', 'mark.schmitt', 'eallatta', 'mfitzpatrick', 'vcowal', 'salihtuncer', 'warrenpettitt', 'pylearner', 'william.levi.collins', 'Barrie', 'Jacob777', 'vfxpraveen117', 'aoteatech', 'nask.meerpaal', 'jsolin', 'd.giordan', 'tmoffdawg', 'scmreason', 'Mr_Gray', 'case.andrew', 'trampgeek', 'niroshseven', 'decadous', 'MrBate', 'stcrispin', 'mieke.gorissen', 'englert.ervin', 'ericsonga', 'mithron', 'numodica', 'Mathad19', 'JenksW', 'LonnekeDriessen', 'kwaljee', 'larry', 'pentrium', 'moffdawg', 'maxmori77', 'SandeepShelke', 'davelai', 'tommrrtt', 'spock_tpol', 'ekeoid', 'BantamJoe', 'alter718_2', 'MegW', 'elanus', 'cn.gheorghetiteica', 'yoyo_available', 'sdmcgee', 'rdotsch', 'gjvoigt', 'gmacmanus', 'mharrison', 'jfemiani', 'femiani', 'kwright', 'dekrwright', 'jenniferchoffmann', 'mrschellenberg', 'The Salty Seaman', 'shane.dicks1', 'mleblanc', 'ProfNorman', 'metrpy', 'gregmax', 'ironwk85', 'hannutam', 'andrewgtp', 'realblurgh', 'madhavjha', 'acase', 'mchen53', 'jgilgen', 'ofenerci', 'JackTompkins', 'mdeakyne', 'spsofficers', 'comqsph', 'SteveTanimoto', 'jiffp', 'rkumar', 'Cmcguire', 'jheckelsmiller', 'OvidioMarinho', 'dianneoc', 'AppylPye', 'cwormsl2', 'kidbookrev', 'bic', 'afseTestAccount', 'vivek.lpc', '10081453', 'kaspermunch', 'supamrchen', 'johnkershaw', 'mesri', 'jfall', 'cheriegilmore', 'cewing', 'beth-mac', 'mcraig', 'jspacco', 'ranum', 'bcooper', 'xuxiaomin19892', 'cryptogoth', 'rowzer', 'allen', 'isaacdl', 'james adams', 'sciberteach', 'perry.fizzano', 'Green1313', 'cmheisel', 'wavemind', 'steven_king', 'assault', 'jgb', 'komo', 'gjenkins@lbcc.edu', 'FEdericoSOmaschini', 'david236', 'zarial01', 'miguser1', 'kev_jones', 'JuYoungKim', 'garyphillips', 'jasonlanda', 'kfkennedy', 'holcma01', 'jrincon@emsofl.com', 'johnpeeb', 'eudaimonious', 'VineetaSingh', 'rmueller', 'kash41', 'rtindell', 'mike.farfaraway', 'vilipold', 'weihongw', 'tonisvahi', 'rafaja', 'aliev', 'zacharyblackwood', 'smckennitt', 'drtech', 'bnminstruct', 'elliott', 'mleblanc@wheatoncollege.edu', 'CameronA', 'nicokie', 'volcan', 'matthewvenn', 'ttttamaki', 'Terry.Pasley@kctcs.edu', 'mpearse', 'gabrinerd', 'danschellenberg', 'alnitak')\n",
  125. "--#list of instructors\n",
  126. ";\n",
  127. "\n",
  128. "COMMIT;"
  129. ],
  130. "language": "python",
  131. "metadata": {},
  132. "outputs": [
  133. {
  134. "output_type": "stream",
  135. "stream": "stdout",
  136. "text": [
  137. "97999 rows affected.\n",
  138. "Done.\n"
  139. ]
  140. },
  141. {
  142. "metadata": {},
  143. "output_type": "pyout",
  144. "prompt_number": 48,
  145. "text": [
  146. "[]"
  147. ]
  148. }
  149. ],
  150. "prompt_number": 48
  151. },
  152. {
  153. "cell_type": "heading",
  154. "level": 3,
  155. "metadata": {},
  156. "source": [
  157. "Remove known test courses. "
  158. ]
  159. },
  160. {
  161. "cell_type": "heading",
  162. "level": 6,
  163. "metadata": {},
  164. "source": [
  165. "Currently limited to just gatch and devcourse. Will revise when known test courses has been updated."
  166. ]
  167. },
  168. {
  169. "cell_type": "code",
  170. "collapsed": false,
  171. "input": [
  172. "%%sql\n",
  173. "DELETE FROM useinfo\n",
  174. "WHERE course_id IN ('gatech', 'devcourse','overview');\n",
  175. "\n",
  176. "COMMIT;"
  177. ],
  178. "language": "python",
  179. "metadata": {},
  180. "outputs": [
  181. {
  182. "output_type": "stream",
  183. "stream": "stdout",
  184. "text": [
  185. "106732 rows affected.\n",
  186. "Done.\n"
  187. ]
  188. },
  189. {
  190. "metadata": {},
  191. "output_type": "pyout",
  192. "prompt_number": 49,
  193. "text": [
  194. "[]"
  195. ]
  196. }
  197. ],
  198. "prompt_number": 49
  199. },
  200. {
  201. "cell_type": "heading",
  202. "level": 3,
  203. "metadata": {},
  204. "source": [
  205. "Remove unofficial div_ids"
  206. ]
  207. },
  208. {
  209. "cell_type": "code",
  210. "collapsed": false,
  211. "input": [
  212. "pythondsDivInfo = getDivs.getDivInfo('pythonds')\n",
  213. "pythondsDivs = pythondsDivInfo['div_id']\n",
  214. "str(pythondsDivs) #NOTE: I formatted as a string so I could copy and paste it more easily. Uncomment to get the list"
  215. ],
  216. "language": "python",
  217. "metadata": {},
  218. "outputs": [
  219. {
  220. "metadata": {},
  221. "output_type": "pyout",
  222. "prompt_number": 50,
  223. "text": [
  224. "\"['active5', 'active6', 'active7', 'analysis_1', 'analysis_2', 'analysis_3', 'findMinVid', 'mcpyperform', 'mcpydictperf', 'pythonopsperf', 'active1', 'active2', 'active3', 'divby2', 'baseconvert', 'baseconvert1', 'baseconvert2', 'baseconvert3', 'video_Stack2', 'deqtest', 'ququeuetest', 'queue_1', 'stack_1ac', 'stack_ex_1', 'stack_cl_1', 'stack_1', 'stack_2', 'stack_stringrev', 'stack1_video', 'orderedlistclass', 'unorderedlistcomplete', 'self_check_list1', 'self_check_list2', 'intopost', 'postfixeval', 'postfix1', 'postfix2', 'postfix3', 'video_Stack3', 'palchecker', 'parcheck1', 'qujosephussim', 'qumainsim', 'print_sim_selfcheck', 'intro_8', 'self_check_1', 'list_unique', 'self_check_2', 'listcomp', 'monkeyvid', 'intro_1', 'intro_2', 'intro_3', 'intro_5', 'intro_7', 'scratch_01_01', 'strstuff', 'gcd_cl', 'fraction_class', 'self_check_4', 'fraction', 'complete_cuircuit', 'self_check_5', 'logicgates', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'HASH_1', 'HASH_2', 'hashtablecomplete', 'search3', 'search4', 'BSRCH_1', 'BSRCH_2', 'lst_bubble', 'bubble_anim', 'bubbletrace', 'lst_shortbubble', 'shortbubbletrace', 'question_sort_1', 'lst_insertion', 'insertion_anim', 'insertionsortcodetrace', 'question_sort_3', 'lst_merge', 'merge_anim', 'mergetrace', 'question_sort_5', 'question_sort_6', 'lst_quick', 'quick_anim', 'quicktrace', 'question_sort_7', 'question_sort_8', 'question_sort_9', 'lst_selectionsortcode', 'selection_anim', 'selectionsortcodetrace', 'question_sort_2', 'search1', 'search2', 'question_SRCH_1', 'question_SRCH_2', 'lst_shellSort', 'shell_anim', 'shellSorttrace', 'question_sort_4', 'completeheap', 'heap1', 'tree_list1', 'bin_tree', 'mctree_1', 'mctree_2', 'bintree', 'mctree_3', 'parsebuild', 'bst_1', 'completebstcode']\""
  225. ]
  226. }
  227. ],
  228. "prompt_number": 50
  229. },
  230. {
  231. "cell_type": "code",
  232. "collapsed": false,
  233. "input": [
  234. "%%sql\n",
  235. "DELETE FROM useinfo\n",
  236. "WHERE id in \n",
  237. "(\n",
  238. "SELECT useinfo.id FROM \n",
  239. "useinfo\n",
  240. "JOIN bookcourses\n",
  241. "ON (useinfo.course_id = bookcourses.course_name)\n",
  242. "WHERE \n",
  243. " book = 'pythonds' \n",
  244. " AND event NOT IN ('page','highlight','ac_error')\n",
  245. " AND div_id NOT IN ('active5', 'active6', 'active7', 'analysis_1', 'analysis_2', 'analysis_3', 'findMinVid', 'mcpyperform', 'mcpydictperf', 'pythonopsperf', 'active1', 'active2', 'active3', 'divby2', 'baseconvert', 'baseconvert1', 'baseconvert2', 'baseconvert3', 'video_Stack2', 'deqtest', 'ququeuetest', 'queue_1', 'stack_1ac', 'stack_ex_1', 'stack_cl_1', 'stack_1', 'stack_2', 'stack_stringrev', 'stack1_video', 'orderedlistclass', 'unorderedlistcomplete', 'self_check_list1', 'self_check_list2', 'intopost', 'postfixeval', 'postfix1', 'postfix2', 'postfix3', 'video_Stack3', 'palchecker', 'parcheck1', 'qujosephussim', 'qumainsim', 'print_sim_selfcheck', 'intro_8', 'self_check_1', 'list_unique', 'self_check_2', 'listcomp', 'monkeyvid', 'intro_1', 'intro_2', 'intro_3', 'intro_5', 'intro_7', 'scratch_01_01', 'strstuff', 'gcd_cl', 'fraction_class', 'self_check_4', 'fraction', 'complete_cuircuit', 'self_check_5', 'logicgates', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'HASH_1', 'HASH_2', 'hashtablecomplete', 'search3', 'search4', 'BSRCH_1', 'BSRCH_2', 'lst_bubble', 'bubble_anim', 'bubbletrace', 'lst_shortbubble', 'shortbubbletrace', 'question_sort_1', 'lst_insertion', 'insertion_anim', 'insertionsortcodetrace', 'question_sort_3', 'lst_merge', 'merge_anim', 'mergetrace', 'question_sort_5', 'question_sort_6', 'lst_quick', 'quick_anim', 'quicktrace', 'question_sort_7', 'question_sort_8', 'question_sort_9', 'lst_selectionsortcode', 'selection_anim', 'selectionsortcodetrace', 'question_sort_2', 'search1', 'search2', 'question_SRCH_1', 'question_SRCH_2', 'lst_shellSort', 'shell_anim', 'shellSorttrace', 'question_sort_4', 'completeheap', 'heap1', 'tree_list1', 'bin_tree', 'mctree_1', 'mctree_2', 'bintree', 'mctree_3', 'parsebuild', 'bst_1', 'completebstcode')\n",
  246. ");\n",
  247. "\n",
  248. "COMMIT;"
  249. ],
  250. "language": "python",
  251. "metadata": {},
  252. "outputs": [
  253. {
  254. "output_type": "stream",
  255. "stream": "stdout",
  256. "text": [
  257. "31112 rows affected.\n",
  258. "Done.\n"
  259. ]
  260. },
  261. {
  262. "metadata": {},
  263. "output_type": "pyout",
  264. "prompt_number": 51,
  265. "text": [
  266. "[]"
  267. ]
  268. }
  269. ],
  270. "prompt_number": 51
  271. },
  272. {
  273. "cell_type": "code",
  274. "collapsed": false,
  275. "input": [
  276. "thinkcspyDivInfo = getDivs.getDivInfo('thinkcspy')\n",
  277. "thinkcspyDivs = thinkcspyDivInfo['div_id']\n",
  278. "str(thinkcspyDivs) #NOTE: I formatted as a string so I could copy and paste it more easily. Uncomment to get the list"
  279. ],
  280. "language": "python",
  281. "metadata": {},
  282. "outputs": [
  283. {
  284. "metadata": {},
  285. "output_type": "pyout",
  286. "prompt_number": 52,
  287. "text": [
  288. "\"['chp13_classes4', 'chp13_classes5', 'chp13_classesstr1', 'chp13_classesstr2', 'ch_cl_ex_1_answer', 'ch_cl_02', 'ch_cl_ex_3_answer', 'ch_cl_04', 'ch_cl_05_answer', 'chp13_improveconstructor', 'chp13_classesmid1', 'scratch_cl_01', 'chp13_classes6', 'chp13_classes1', 'chp13_points', 'chp13_classes2', 'fractions_add1', 'scratch_cl_02\\\\t', 'ch_cl2_answer1', 'ch_cl2_q2 ', 'ch_cl2_q3answer', 'ch_cl2_q4', 'ch_cl2_q5answer', 'ch_cl2_answer7', 'ch_cl2_q8', 'fractions_init', 'fractions_gcd', 'fractions_simplify', 'fractions_is', 'fractions_eq1', 'db_ex3_1', 'db_ex3_2', 'db_q_ex3_1', 'db_ex3_3', 'db_ex3_4', 'db_ex3_5', 'db_qex32', 'db_ex3_6', 'db_ex3_7', 'db_ex3_8', 'db_ex3_9', 'db_ex3_10', 'db_ex3_11', 'db_ex3_12', 'ch12_dict11', 'test_question11_4_1', 'chp12_dict6', 'chp12_dict7', 'chp12_dict8', 'chp12_dict9', 'chp12_dict10', 'scratch_11_02', 'test_question11_3_1', 'test_question11_3_2', 'test_question11_3_3', 'test_question11_3_4', 'test_question11_3_5', 'ch12_dict4', 'ch12_dict4a', 'ch12_dict5', 'test_question11_2_1', 'ex_11_01', 'q1_answer', 'ex_11_02', 'ex_11_03', 'ex_11_04', 'ch11_q5_answer', 'chp12_sparse', 'chp12_dict1', 'chp12_dict2', 'chp12_dict3', 'scratch_11_01', 'test_question11_1_1 ', 'test_question11_1_2', 'files_while', 'ex_6_1', 'ch_files_q1answer', 'ex_10_2', 'ex_6_3', 'ch_files_q3answer', 'ex_10_4', 'ex_10_5', 'ch_files_q5answer', 'files_for', 'files_write01', 'ch05_barchart', 'scratch_05_06', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_newarea', 'test_question5_6_1', 'test_question5_6_2', 'sumofsquares', 'ch04_3', 'ch04_4', 'ch04_5', 'ch04_6', 'ch04_square', 'ch04_clsquare', 'ch04_clsquare_bad', 'test_question5_2_1', 'test_question5_2_2', 'ch06_distance1', 'ch06_distancefinal', 'function_accumulator_pattern', 'sq_accum1', 'sq_accum3', 'scratch_05_04', 'test_question5_4_1', 'question5_4_1p', 'bad_local', 'badsquare_1', 'cl_powerof_bad', 'cl_change_parm', 'test_question5_3_1', 'test_question5_3_2', 'test_question5_3_3', 'function_intro', 'ch04_1', 'ch04_1a', 'ch04_2', 'scratch_05_01', 'test_question5_1_1', 'test_question5_1_2', 'test_question5_1_3', 'test_question5_1_4', 'test_question5_1_5', 'test_question5_1_6', 'test_question5_1_7', 'ch01_2', 'question1_11_1', 'question1_1_1', 'question1_1_2', 'ch01_3', 'question1_12_1', 'scratch_01', 'question1_9_1', 'question1_10_1', 'question1_10_2', 'question1_4_1', 'question1_7_1', 'question1_8_1', 'codelensvid', 'ch01_1', 'firstexample', 'question1_3_1', 'question1_3_2', 'question1_6_1', 'question1_6_2', 'question1_2_1', 'question1_2_2', 'question1_2_3', 'question1_5_1', 'astlab_ast1', 'lab01_1', 'lab01_2', 'l0401', 'lab0401a', 'lab0401aa', 'chp12_single', 'piguessstart', 'seq3nlab1', 'sin1', 'sin2', 'sinlab1', 'sinlab2', 'chp09_02', 'test_question9_3_1', 'test_question9_3_2', 'test_question9_3_3', 'listalias1', 'chp09_is3', 'test_question9_10_1', 'appcon1', 'appcon2', 'appcon3', 'test_question9_15_1', 'chp09_is4', 'chp09_5', 'chp09_concatid', 'test_question9_5_1', 'test_question9_5_2', 'ex_9_2', 'ex_9_3', 'q3_answer', 'ex_9_4', 'ex_9_5', 'q5_answer', 'ex_7_11', 'ex_9_6', 'q7_answer', 'ex_9_7', 'ex_9_8', 'q9_answer', 'ex_9_9', 'ex_9_10', 'q11_answer', 'ex_9_11', 'ex_9_12', 'q13_answer', 'ex_9_13', 'ex_9_14', 'q15_answer', 'listcomp1', 'scratch_09_06', 'test_question9_20_1', 'ch09_11', 'scratch_09_01', 'chp09_01a', 'test_question9_2_1 ', 'test_question9_2_2', 'chp09_4', 'test_question9_4_1', 'test_question9_4_2', 'chp09_meth1', 'chp09_meth2', 'test_question9_13_1', 'test_question9_13_2', 'test_question9_13_3', 'test_question9_13_4', 'scratch_09_03', 'chp09_6', 'test_question9_6_1', 'chp09_01', 'test_question9_1_1 ', 'chp09_03a', 'chp09_03b', 'chp09_for3', 'chp09_for4', 'test_question9_16_1', 'ch09_7', 'item_assign', 'ch09_8', 'ch09_9', 'ch09_10', 'test_question9_7_1', 'chp09_nest', 'test_question9_21_1', 'chp09_is1', 'chp09_is2', 'chp09_istrace', 'ch09_mod2', 'ch09_mod3', 'repref1', 'repref2', 'repref3', 'reprefstep', 'test_question9_12_1', 'test_question9_12_2', 'ch09_split1', 'ch09_split2', 'ch09_join', 'test_question9_22_1', 'list_lsys1', 'list_lsys2', 'ch09_tuple1', 'chp09_tuple2', 'chp09_tuple3', 'scratch_09_07', 'chp09_parm1', 'chp09_parm1_trace', 'ch09_list1', 'pixelex1a', 'test_question7_8_1_1', 'pixelex1', 'test_question7_8_2_1', 'nested1', 'nested2', 'acimg_1', 'test_question7_8_3_1', 'test_question7_8_3_2', 'scratch_07_03', 'ex_7_7', 'q1_answer', 'ex_7_8', 'ex_7_9', 'q3_answer', 'ex_7_14', 'ex_7_13', 'q5_answer', 'ex_7_12', 'ex_7_15', 'q7_answer', 'ex_7_16', 'ex_7_17', 'q9_answer', 'ex_7_18', 'ex_7_19', 'answer_7_11', 'ex_7_20', 'ex_7_21', 'q13_answer', 'ex_7_22', 'ex_7_23', 'q15_answer', 'scratch_07_05', 'chp07_newtonsdef', 'chp07_newtonswhile', 'iter_randwalk1', 'iter_randwalk2', 'test_question7_3_1', 'test_question7_3_2', 'ch07_table1', 'test_question7_7_1', 'ch07_indef1', 'test_question7_4_1', 'ch07_for1', 'ch07_summation', 'whileloop', 'ch07_while1', 'ch07_while2', 'scratch_07_01', 'test_question7_2_1', 'test_question7_2_2', 'mod_q1_answer', 'ex_mod_2', 'mod_q3_answer', 'ex_mod_4', 'randmodvid', 'chmodule_02', 'question4_2_1', 'chmodule_rand', 'chmodule_rand2', 'question4_4_1', 'question4_4_2', 'question4_4_3', 'question4_4_4', 'scratch_04', 'inputvid', 'chmod_01', 'question4_1_1', 'question4_1_2', 'question4_1_3', 'ch03_7', '3_10', '3_11', 'ex_3_1', 'q1_answer', 'ex_3_2', 'ex_3_3', 'q3_answer', 'ex_3_4', 'ex_3_5', 'ex_3_6', 'ex_3_7', 'q7_answer', 'ex_3_8', 'ex_3_9', 'q9_answer', 'ex_3_10', 'ex_3_11', 'q11_answer', 'ex_3_12', 'ex_3_13', 'q13_answer', 'vtest', 'ch03_3', 'test_question3_2_1', '3_6', '3_7', 'ch03_for1', 'ch03_forcolor', 'colorlist', '3_8', '3_9', 'test_question3_4_1', 'test_question3_4_2', 'test_question3_4_3', 'test_question3_4_4', 'ch03_1', 'test_question3_1_0', '3_1', '3_2', '3_3', 'ch03_2', 'test_question3_1_1', 'test_question3_1_2', 'test_question3_1_3', 'test_question3_1_4', '3_4', '3_5', 'scratch_03', 'forloopvid', 'ch03_4', 'advrange', 'ch03_5', 'rangeme', 'ch03_6', 'rangeme2', 'test_question3_5_1', 'test_question3_5_2', 'test_question3_5_3', 'assignvid', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_boolcodelens', 'test_question6_8_1', 'test_question6_8_2', 'scratch_06_03', 'booleanexpressions', 'ch05_1', 'ch05_1a', 'ch05_2', 'test_question6_1_1', 'sel4', 'scratch_06_02', 'test_question6_7_1', 'test_question6_7_2', 'binaryselection', 'ch05_4', 'test_question6_4_1', 'test_question6_4_2', 'test_question6_4_3', 'ex_6_1', 'ex_6_2', 'ex_6_3', 'q3_question', 'ex_6_4', 'ex_6_5', 'answer_ex_6_5', 'ex_6_6', 'ex_6_7', 'q7_answer', 'ex_6_8', 'ex_6_9', 'q9_answer', 'ex_6_10', 'ex_6_11', 'q11_answer', 'ex_6_12', 'ex_6_13', 'answer_ex_6_13', 'chp05_3', 'test_question6_2_1', 'sel2', 'sel1', 'test_question6_6_1', 'unaryselection', 'ch05_unaryselection', 'test_question6_5_1', 'test_question6_5_2', 'scratch_06_01', 'test_question6_3_1', 'ch02_ex1', 'ex_2_2', 'ex_2_3', 'q3_answer', 'ex_2_4', 'ex_2_5', 'q5_answer ', 'ex_2_6', 'ex_2_7', 'q7_answer', 'ex_2_8', 'ex_2_9', 'q9_answer', 'ex_2_10', 'ex_2_11', 'q11_answer', 'ex_2_12', 'inputvid', 'inputfun', 'int_secs', 'test_question2_7_1', 'ch02_15', 'ch02_16', 'ch02_17', 'ch02_18', 'ch02_19_codelens', 'test_question2_6_1', 'test_question2_6_2', 'test_question2_6_3', 'precedencevid', 'associativityvid', 'ch02_23', 'test_question2_8_1', 'test_question2_8_2', 'reassignmentvid', 'ch07_reassign1', 'ch07_reassign2', 'test_question2_9_1', 'expression_vid', 'ch02_13', 'ch02_14', 'ch02_20', 'ch02_21', 'ch02_22', 'test_question2_2_1', 'updatevid', 'ch07_update1', 'test_question2_10_1', 'test_question2_10_2', 'question2_10_3', 'scratch_02', 'ch02_1', 'ch02_2', 'ch02_3', 'ch02_4', 'ch02_5', 'ch02_6', 'ch02_7', 'ch02_8', 'ch02_8a', 'test_question2_1_1', 'test_question2_1_2', 'test_question2_4_1', 'assignvid', 'ch02_9', 'ch02_9_codelens', 'ch02_10', 'ch02_11', 'test_question2_3_2', 'typesnconvert', 'ch08_run3', 'scratch_08_04', 'ex_8_2', 'ex_8_3', 'q3_answer', 'ex_8_4', 'ex_7_10', 'q5_answer', 'ex_8_5', 'ex_8_6', 'q7_answer', 'ex_8_7', 'ex_8_8', 'q9_answer', 'ex_8_9', 'ex_8_10', 'q11_answer', 'ex_8_11', 'ex_8_12', 'q13_answer', 'ex_8_13', 'ex_8_14', 'q15_answer', 'ex_8_15', 'ex_8_16', 'q17_answer', 'ex_8_17', 'ex_8_18', 'q19_answer', 'ex_8_19', 'ex_8_20', 'q21_answer', 'chp08_index1', 'test_question8_2_1', 'test_question8_2_2', 'chp08_len1', 'chp08_len2', 'ch08_len3', 'test_question8_4_1', 'test_question8_4_2', 'chp08_fun2', 'ch08_add', 'ch08_mult', 'test_question8_1_1 ', 'test_question8_1_2', 'ch08_fun4', 'chp08_fun5', 'chp08_fun6', 'ch08_comp1', 'ch08_comp2', 'chp08_ord1', 'ch08_ord2', 'ch08_ord3', 'test_question8_6_1', 'test_question8_6_2', 'test_question8_6_3', 'chp08_upper', 'ch08_methods1', 'ch08_methods2', 'test_question8_3_1', 'test_question8_3_2', 'cg08_imm1', 'ch08_imm2', 'test_question8_7_1', 'ch08_acc1', 'ch08_acc2', 'test_question8_11_1', 'scratch_08_03', 'chp08_slice1', 'chp08_slice2', 'test_question8_5_1', 'test_question8_5_2', 'scratch_08_01', 'chp8_in1', 'chp8_in2', 'chp8_in3', 'ch08_7', 'ch08_7b', 'ch08_8', 'test_question8_9_1', 'ch08_4', 'ch08_5', 'ch08_6', 'test_question8_8_1', 'test_question8_8_2', 'ch08_7c', 'ch08_7c1', 'test_question8_10_1', 'scratch_08_02', 'string_lsys1', 'strings_lys2']\""
  289. ]
  290. }
  291. ],
  292. "prompt_number": 52
  293. },
  294. {
  295. "cell_type": "code",
  296. "collapsed": false,
  297. "input": [
  298. "%%sql\n",
  299. "DELETE FROM useinfo\n",
  300. "WHERE id IN\n",
  301. "(\n",
  302. "SELECT useinfo.id FROM useinfo\n",
  303. "JOIN bookcourses\n",
  304. "ON (useinfo.course_id = bookcourses.course_name)\n",
  305. "WHERE \n",
  306. " book = 'thinkcspy' \n",
  307. " AND event NOT IN ('page','highlight','ac_error')\n",
  308. " AND div_id NOT IN ('chp13_classes4', 'chp13_classes5', 'chp13_classesstr1', 'chp13_classesstr2', 'ch_cl_ex_1_answer', 'ch_cl_02', 'ch_cl_ex_3_answer', 'ch_cl_04', 'ch_cl_05_answer', 'chp13_improveconstructor', 'chp13_classesmid1', 'scratch_cl_01', 'chp13_classes6', 'chp13_classes1', 'chp13_points', 'chp13_classes2', 'fractions_add1', 'scratch_cl_02\\\\t', 'ch_cl2_answer1', 'ch_cl2_q2 ', 'ch_cl2_q3answer', 'ch_cl2_q4', 'ch_cl2_q5answer', 'ch_cl2_answer7', 'ch_cl2_q8', 'fractions_init', 'fractions_gcd', 'fractions_simplify', 'fractions_is', 'fractions_eq1', 'db_ex3_1', 'db_ex3_2', 'db_q_ex3_1', 'db_ex3_3', 'db_ex3_4', 'db_ex3_5', 'db_qex32', 'db_ex3_6', 'db_ex3_7', 'db_ex3_8', 'db_ex3_9', 'db_ex3_10', 'db_ex3_11', 'db_ex3_12', 'ch12_dict11', 'test_question11_4_1', 'chp12_dict6', 'chp12_dict7', 'chp12_dict8', 'chp12_dict9', 'chp12_dict10', 'scratch_11_02', 'test_question11_3_1', 'test_question11_3_2', 'test_question11_3_3', 'test_question11_3_4', 'test_question11_3_5', 'ch12_dict4', 'ch12_dict4a', 'ch12_dict5', 'test_question11_2_1', 'ex_11_01', 'q1_answer', 'ex_11_02', 'ex_11_03', 'ex_11_04', 'ch11_q5_answer', 'chp12_sparse', 'chp12_dict1', 'chp12_dict2', 'chp12_dict3', 'scratch_11_01', 'test_question11_1_1 ', 'test_question11_1_2', 'files_while', 'ex_6_1', 'ch_files_q1answer', 'ex_10_2', 'ex_6_3', 'ch_files_q3answer', 'ex_10_4', 'ex_10_5', 'ch_files_q5answer', 'files_for', 'files_write01', 'ch05_barchart', 'scratch_05_06', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_newarea', 'test_question5_6_1', 'test_question5_6_2', 'sumofsquares', 'ch04_3', 'ch04_4', 'ch04_5', 'ch04_6', 'ch04_square', 'ch04_clsquare', 'ch04_clsquare_bad', 'test_question5_2_1', 'test_question5_2_2', 'ch06_distance1', 'ch06_distancefinal', 'function_accumulator_pattern', 'sq_accum1', 'sq_accum3', 'scratch_05_04', 'test_question5_4_1', 'question5_4_1p', 'bad_local', 'badsquare_1', 'cl_powerof_bad', 'cl_change_parm', 'test_question5_3_1', 'test_question5_3_2', 'test_question5_3_3', 'function_intro', 'ch04_1', 'ch04_1a', 'ch04_2', 'scratch_05_01', 'test_question5_1_1', 'test_question5_1_2', 'test_question5_1_3', 'test_question5_1_4', 'test_question5_1_5', 'test_question5_1_6', 'test_question5_1_7', 'ch01_2', 'question1_11_1', 'question1_1_1', 'question1_1_2', 'ch01_3', 'question1_12_1', 'scratch_01', 'question1_9_1', 'question1_10_1', 'question1_10_2', 'question1_4_1', 'question1_7_1', 'question1_8_1', 'codelensvid', 'ch01_1', 'firstexample', 'question1_3_1', 'question1_3_2', 'question1_6_1', 'question1_6_2', 'question1_2_1', 'question1_2_2', 'question1_2_3', 'question1_5_1', 'astlab_ast1', 'lab01_1', 'lab01_2', 'l0401', 'lab0401a', 'lab0401aa', 'chp12_single', 'piguessstart', 'seq3nlab1', 'sin1', 'sin2', 'sinlab1', 'sinlab2', 'chp09_02', 'test_question9_3_1', 'test_question9_3_2', 'test_question9_3_3', 'listalias1', 'chp09_is3', 'test_question9_10_1', 'appcon1', 'appcon2', 'appcon3', 'test_question9_15_1', 'chp09_is4', 'chp09_5', 'chp09_concatid', 'test_question9_5_1', 'test_question9_5_2', 'ex_9_2', 'ex_9_3', 'q3_answer', 'ex_9_4', 'ex_9_5', 'q5_answer', 'ex_7_11', 'ex_9_6', 'q7_answer', 'ex_9_7', 'ex_9_8', 'q9_answer', 'ex_9_9', 'ex_9_10', 'q11_answer', 'ex_9_11', 'ex_9_12', 'q13_answer', 'ex_9_13', 'ex_9_14', 'q15_answer', 'listcomp1', 'scratch_09_06', 'test_question9_20_1', 'ch09_11', 'scratch_09_01', 'chp09_01a', 'test_question9_2_1 ', 'test_question9_2_2', 'chp09_4', 'test_question9_4_1', 'test_question9_4_2', 'chp09_meth1', 'chp09_meth2', 'test_question9_13_1', 'test_question9_13_2', 'test_question9_13_3', 'test_question9_13_4', 'scratch_09_03', 'chp09_6', 'test_question9_6_1', 'chp09_01', 'test_question9_1_1 ', 'chp09_03a', 'chp09_03b', 'chp09_for3', 'chp09_for4', 'test_question9_16_1', 'ch09_7', 'item_assign', 'ch09_8', 'ch09_9', 'ch09_10', 'test_question9_7_1', 'chp09_nest', 'test_question9_21_1', 'chp09_is1', 'chp09_is2', 'chp09_istrace', 'ch09_mod2', 'ch09_mod3', 'repref1', 'repref2', 'repref3', 'reprefstep', 'test_question9_12_1', 'test_question9_12_2', 'ch09_split1', 'ch09_split2', 'ch09_join', 'test_question9_22_1', 'list_lsys1', 'list_lsys2', 'ch09_tuple1', 'chp09_tuple2', 'chp09_tuple3', 'scratch_09_07', 'chp09_parm1', 'chp09_parm1_trace', 'ch09_list1', 'pixelex1a', 'test_question7_8_1_1', 'pixelex1', 'test_question7_8_2_1', 'nested1', 'nested2', 'acimg_1', 'test_question7_8_3_1', 'test_question7_8_3_2', 'scratch_07_03', 'ex_7_7', 'q1_answer', 'ex_7_8', 'ex_7_9', 'q3_answer', 'ex_7_14', 'ex_7_13', 'q5_answer', 'ex_7_12', 'ex_7_15', 'q7_answer', 'ex_7_16', 'ex_7_17', 'q9_answer', 'ex_7_18', 'ex_7_19', 'answer_7_11', 'ex_7_20', 'ex_7_21', 'q13_answer', 'ex_7_22', 'ex_7_23', 'q15_answer', 'scratch_07_05', 'chp07_newtonsdef', 'chp07_newtonswhile', 'iter_randwalk1', 'iter_randwalk2', 'test_question7_3_1', 'test_question7_3_2', 'ch07_table1', 'test_question7_7_1', 'ch07_indef1', 'test_question7_4_1', 'ch07_for1', 'ch07_summation', 'whileloop', 'ch07_while1', 'ch07_while2', 'scratch_07_01', 'test_question7_2_1', 'test_question7_2_2', 'mod_q1_answer', 'ex_mod_2', 'mod_q3_answer', 'ex_mod_4', 'randmodvid', 'chmodule_02', 'question4_2_1', 'chmodule_rand', 'chmodule_rand2', 'question4_4_1', 'question4_4_2', 'question4_4_3', 'question4_4_4', 'scratch_04', 'inputvid', 'chmod_01', 'question4_1_1', 'question4_1_2', 'question4_1_3', 'ch03_7', '3_10', '3_11', 'ex_3_1', 'q1_answer', 'ex_3_2', 'ex_3_3', 'q3_answer', 'ex_3_4', 'ex_3_5', 'ex_3_6', 'ex_3_7', 'q7_answer', 'ex_3_8', 'ex_3_9', 'q9_answer', 'ex_3_10', 'ex_3_11', 'q11_answer', 'ex_3_12', 'ex_3_13', 'q13_answer', 'vtest', 'ch03_3', 'test_question3_2_1', '3_6', '3_7', 'ch03_for1', 'ch03_forcolor', 'colorlist', '3_8', '3_9', 'test_question3_4_1', 'test_question3_4_2', 'test_question3_4_3', 'test_question3_4_4', 'ch03_1', 'test_question3_1_0', '3_1', '3_2', '3_3', 'ch03_2', 'test_question3_1_1', 'test_question3_1_2', 'test_question3_1_3', 'test_question3_1_4', '3_4', '3_5', 'scratch_03', 'forloopvid', 'ch03_4', 'advrange', 'ch03_5', 'rangeme', 'ch03_6', 'rangeme2', 'test_question3_5_1', 'test_question3_5_2', 'test_question3_5_3', 'assignvid', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_boolcodelens', 'test_question6_8_1', 'test_question6_8_2', 'scratch_06_03', 'booleanexpressions', 'ch05_1', 'ch05_1a', 'ch05_2', 'test_question6_1_1', 'sel4', 'scratch_06_02', 'test_question6_7_1', 'test_question6_7_2', 'binaryselection', 'ch05_4', 'test_question6_4_1', 'test_question6_4_2', 'test_question6_4_3', 'ex_6_1', 'ex_6_2', 'ex_6_3', 'q3_question', 'ex_6_4', 'ex_6_5', 'answer_ex_6_5', 'ex_6_6', 'ex_6_7', 'q7_answer', 'ex_6_8', 'ex_6_9', 'q9_answer', 'ex_6_10', 'ex_6_11', 'q11_answer', 'ex_6_12', 'ex_6_13', 'answer_ex_6_13', 'chp05_3', 'test_question6_2_1', 'sel2', 'sel1', 'test_question6_6_1', 'unaryselection', 'ch05_unaryselection', 'test_question6_5_1', 'test_question6_5_2', 'scratch_06_01', 'test_question6_3_1', 'ch02_ex1', 'ex_2_2', 'ex_2_3', 'q3_answer', 'ex_2_4', 'ex_2_5', 'q5_answer ', 'ex_2_6', 'ex_2_7', 'q7_answer', 'ex_2_8', 'ex_2_9', 'q9_answer', 'ex_2_10', 'ex_2_11', 'q11_answer', 'ex_2_12', 'inputvid', 'inputfun', 'int_secs', 'test_question2_7_1', 'ch02_15', 'ch02_16', 'ch02_17', 'ch02_18', 'ch02_19_codelens', 'test_question2_6_1', 'test_question2_6_2', 'test_question2_6_3', 'precedencevid', 'associativityvid', 'ch02_23', 'test_question2_8_1', 'test_question2_8_2', 'reassignmentvid', 'ch07_reassign1', 'ch07_reassign2', 'test_question2_9_1', 'expression_vid', 'ch02_13', 'ch02_14', 'ch02_20', 'ch02_21', 'ch02_22', 'test_question2_2_1', 'updatevid', 'ch07_update1', 'test_question2_10_1', 'test_question2_10_2', 'question2_10_3', 'scratch_02', 'ch02_1', 'ch02_2', 'ch02_3', 'ch02_4', 'ch02_5', 'ch02_6', 'ch02_7', 'ch02_8', 'ch02_8a', 'test_question2_1_1', 'test_question2_1_2', 'test_question2_4_1', 'assignvid', 'ch02_9', 'ch02_9_codelens', 'ch02_10', 'ch02_11', 'test_question2_3_2', 'typesnconvert', 'ch08_run3', 'scratch_08_04', 'ex_8_2', 'ex_8_3', 'q3_answer', 'ex_8_4', 'ex_7_10', 'q5_answer', 'ex_8_5', 'ex_8_6', 'q7_answer', 'ex_8_7', 'ex_8_8', 'q9_answer', 'ex_8_9', 'ex_8_10', 'q11_answer', 'ex_8_11', 'ex_8_12', 'q13_answer', 'ex_8_13', 'ex_8_14', 'q15_answer', 'ex_8_15', 'ex_8_16', 'q17_answer', 'ex_8_17', 'ex_8_18', 'q19_answer', 'ex_8_19', 'ex_8_20', 'q21_answer', 'chp08_index1', 'test_question8_2_1', 'test_question8_2_2', 'chp08_len1', 'chp08_len2', 'ch08_len3', 'test_question8_4_1', 'test_question8_4_2', 'chp08_fun2', 'ch08_add', 'ch08_mult', 'test_question8_1_1 ', 'test_question8_1_2', 'ch08_fun4', 'chp08_fun5', 'chp08_fun6', 'ch08_comp1', 'ch08_comp2', 'chp08_ord1', 'ch08_ord2', 'ch08_ord3', 'test_question8_6_1', 'test_question8_6_2', 'test_question8_6_3', 'chp08_upper', 'ch08_methods1', 'ch08_methods2', 'test_question8_3_1', 'test_question8_3_2', 'cg08_imm1', 'ch08_imm2', 'test_question8_7_1', 'ch08_acc1', 'ch08_acc2', 'test_question8_11_1', 'scratch_08_03', 'chp08_slice1', 'chp08_slice2', 'test_question8_5_1', 'test_question8_5_2', 'scratch_08_01', 'chp8_in1', 'chp8_in2', 'chp8_in3', 'ch08_7', 'ch08_7b', 'ch08_8', 'test_question8_9_1', 'ch08_4', 'ch08_5', 'ch08_6', 'test_question8_8_1', 'test_question8_8_2', 'ch08_7c', 'ch08_7c1', 'test_question8_10_1', 'scratch_08_02', 'string_lsys1', 'strings_lys2')\n",
  309. ");\n",
  310. "COMMIT;"
  311. ],
  312. "language": "python",
  313. "metadata": {},
  314. "outputs": [
  315. {
  316. "output_type": "stream",
  317. "stream": "stdout",
  318. "text": [
  319. "961911 rows affected.\n",
  320. "Done.\n"
  321. ]
  322. },
  323. {
  324. "metadata": {},
  325. "output_type": "pyout",
  326. "prompt_number": 53,
  327. "text": [
  328. "[]"
  329. ]
  330. }
  331. ],
  332. "prompt_number": 53
  333. },
  334. {
  335. "cell_type": "heading",
  336. "level": 3,
  337. "metadata": {},
  338. "source": [
  339. "Find and remove infered test courses or inactive courses"
  340. ]
  341. },
  342. {
  343. "cell_type": "heading",
  344. "level": 6,
  345. "metadata": {},
  346. "source": [
  347. "In order to clean out test courses, we need to infer which book goes with each course. This will result in the creation of a bookcourses table, a table similar to courses but with the book tacked on the end."
  348. ]
  349. },
  350. {
  351. "cell_type": "code",
  352. "collapsed": false,
  353. "input": [
  354. "class CoursesForBook():\n",
  355. " def __init__(self,book):\n",
  356. " self.name = book\n",
  357. " self.courses = []\n",
  358. " self.keyWords = []\n",
  359. " def addCourse(self,course):\n",
  360. " if course not in self.courses:\n",
  361. " self.courses.append(course)\n",
  362. " def addKeyWords(self, wordlst):\n",
  363. " for word in wordlst:\n",
  364. " self.keyWords.append(word)\n",
  365. " def __str__(self):\n",
  366. " string = self.name + '\\n'+''\n",
  367. " for i in self.courses:\n",
  368. " string +=i + '\\n'\n",
  369. " return string"
  370. ],
  371. "language": "python",
  372. "metadata": {},
  373. "outputs": [],
  374. "prompt_number": 54
  375. },
  376. {
  377. "cell_type": "code",
  378. "collapsed": false,
  379. "input": [
  380. "book = bp.makeBookOutline('pythonds')\n",
  381. "book2 = bp.makeBookOutline('thinkcspy')\n",
  382. "\n",
  383. "pythonds = CoursesForBook('pythonds')\n",
  384. "pythonds.addKeyWords(['Analysis','Trees','BasicDS'])\n",
  385. "thinkcspy = CoursesForBook('thinkcspy')\n",
  386. "thinkcspy.addKeyWords(['Labs','SimplePythonData','GeneralIntro', 'moreiteration', 'helloturtle'])"
  387. ],
  388. "language": "python",
  389. "metadata": {},
  390. "outputs": [],
  391. "prompt_number": 55
  392. },
  393. {
  394. "cell_type": "code",
  395. "collapsed": false,
  396. "input": [
  397. "thinkcspy.courses"
  398. ],
  399. "language": "python",
  400. "metadata": {},
  401. "outputs": [
  402. {
  403. "metadata": {},
  404. "output_type": "pyout",
  405. "prompt_number": 56,
  406. "text": [
  407. "[]"
  408. ]
  409. }
  410. ],
  411. "prompt_number": 56
  412. },
  413. {
  414. "cell_type": "code",
  415. "collapsed": false,
  416. "input": [
  417. "query = '''\n",
  418. "SELECT distinct course_id, div_id\n",
  419. "FROM useinfo where event = 'page'\n",
  420. "'''\n",
  421. "df = psql.read_sql(query,conn)"
  422. ],
  423. "language": "python",
  424. "metadata": {},
  425. "outputs": [],
  426. "prompt_number": 57
  427. },
  428. {
  429. "cell_type": "code",
  430. "collapsed": false,
  431. "input": [
  432. "count = 0\n",
  433. "for num in df.index:\n",
  434. " count += 1\n",
  435. " if count % 500000 == 1:\n",
  436. " print(count)\n",
  437. " row = df.iloc[num]\n",
  438. " course = row.course_id\n",
  439. " div = row.div_id\n",
  440. " for word in pythonds.keyWords:\n",
  441. " try:\n",
  442. " if word in div :\n",
  443. " pythonds.addCourse(course)\n",
  444. " except TypeError:\n",
  445. " pass\n",
  446. " for word in thinkcspy.keyWords:\n",
  447. " try:\n",
  448. " if word in div :\n",
  449. " thinkcspy.addCourse(course)\n",
  450. " except TypeError:\n",
  451. " pass"
  452. ],
  453. "language": "python",
  454. "metadata": {},
  455. "outputs": [
  456. {
  457. "output_type": "stream",
  458. "stream": "stdout",
  459. "text": [
  460. "1\n"
  461. ]
  462. }
  463. ],
  464. "prompt_number": 58
  465. },
  466. {
  467. "cell_type": "code",
  468. "collapsed": false,
  469. "input": [
  470. "len(pythonds.courses)"
  471. ],
  472. "language": "python",
  473. "metadata": {},
  474. "outputs": [
  475. {
  476. "metadata": {},
  477. "output_type": "pyout",
  478. "prompt_number": 59,
  479. "text": [
  480. "14"
  481. ]
  482. }
  483. ],
  484. "prompt_number": 59
  485. },
  486. {
  487. "cell_type": "code",
  488. "collapsed": false,
  489. "input": [
  490. "q = '''\n",
  491. "SELECT * FROM courses\n",
  492. "'''\n",
  493. "course_ids = psql.read_sql(q, conn)"
  494. ],
  495. "language": "python",
  496. "metadata": {},
  497. "outputs": [],
  498. "prompt_number": 60
  499. },
  500. {
  501. "cell_type": "heading",
  502. "level": 6,
  503. "metadata": {},
  504. "source": [
  505. "These two lists are the same lists as pythonds.courses and thinkcspy.courses. I simply copied them to save time. "
  506. ]
  507. },
  508. {
  509. "cell_type": "code",
  510. "collapsed": false,
  511. "input": [
  512. "#temp1 = ['pythonds', 'COSC122', 'gatech', 'cs151ranum', 'cs22lbcc', 'cs22-lbcc', 'PaideiaCS2', 'CS136', 'GFU-Algorithms', 'GUDataStructures', 'PythonLevel2', 'luther151', 'devcourse']\n",
  513. "#temp2 = ['thinkcspy', 'aaron-thinkpy', 'IntroCompScience', 'CSC131HUA', 'CSC131004Fall2013', 'STEM_LWSD', 'CSC131tagliarini', 'CS115', 'GalileoAOIT', 'green', '729G04-HT2013', 'ISTA130_FALL_2013', 'FlemingPSP', 'Duke_CompSci101', 'METR1313', 'CS_Python', 'HPComSci_IB', 'CS1-Python', 'PSSD_Python', 'cs150ranum', 'CS30', '152', 'luther150c_fall13', 'SER101F2013', 'scacs', 'IHSPython', 'CS101', 'python_13', 'SMU_CS101', 'CS1140', 'CS140-WCC-F13', 'ISYS217', 'python', 'ICS2O-3C', 'LearnPython', '10ACS', 'Python', 'ITLP-Python', 'csf13f', 'secondp', 'FCHSCP2013', 'wspython', '01418116_56', 'qwerty', 'Thacker', 'IntroProgrammingJHS', 'Griess_Python', 'ParsonsTest', 'ComputerProgramming', 'CSIS101', 'MrEasonECSFall2013', 'CS@WPS', 'CS_20', 'python4lindseya', 'CSC226', 'BC_GCSECompSci', 'lsmsacs244v', 'pyladies', 'CS244', 'ITEC-1150-01-S14', '152Haugrud', 'CSC131tompkinsj', 'ComputerLiteracyPython', 'bcscomputing', 'BigSnake', 'CSC112800', 'testcamp', 'IC-COMP-171-SP-14', 'ProjectNorKam', 'cs110williams', 'CSIS152Spr2014Ficek', 'SUS13', 'SMU_CS106', 'CSIS101W14', 'ANewCourse', 'SBC2014', 'CSGPython2', 'comp123-s14', 'cs150ranumspr2014', 'NewmanSKE2014', 'CS902', 'peeblescourse', 'Spring2014CS104', 'cf_pyintro', 'HTHcomptech', 'PythonASMSA', 'CodingLevel1', 'PythonLevel1', 'CASMTFeb14', 'Pythonintro', 'PythonIntro', 'neintrocs', 'CSIT902', 'CS902Spr14', 'cap-comp101', 'cs-147-s14', 'LASAIntroCS', 'cs902python', 'APCS-Python', 'WCC_CS140_SP14', 'RMBSProgrammingSkills', 'SolinMediaComp', 'csTestPrject', '1314SP_HCT2447', 'ITLP', 'CS101SP14', 'IS1008', 'ProvaAnn77Corso', 'BeensPy', 'python4bio', 'gatech', 'bet_test', 'devcourse', 'pyocc', 'craig-trial', 'CSC112-800', 'comp115', 'unics1140', 'luther150a', 'luther150bc', 'CMPUT 174 Fall 2012', 'CS-1P', 'CMPUT174Fall2012', 'HCHS_GCSE_Python3', 'overview', 'PHBCS', 'HCHS_GCSE_Python', 'COMP201', 'Explorations', 'SKECS2012PythonI', 'SKECS2013PythonI', 'Python101', 'UniOfHertsCS101', 'BMO', 'AMSACS', 'C152_Sum', 'NZTeach', 'IntroCC', 'Summer2013', 'CBCS_CS101', 'PythonWorkshop', 'OCC_CS_A131_FALL_13', 'Python1', 'bic-think-tutorial', 'test_tssi']"
  514. ],
  515. "language": "python",
  516. "metadata": {},
  517. "outputs": [],
  518. "prompt_number": 61
  519. },
  520. {
  521. "cell_type": "heading",
  522. "level": 6,
  523. "metadata": {},
  524. "source": [
  525. "If running anew, replace temp1 and temp2 with pythonds.courses and thinkcspy.courses respectively."
  526. ]
  527. },
  528. {
  529. "cell_type": "code",
  530. "collapsed": false,
  531. "input": [
  532. "f = (lambda x: 'pythonds' if x in pythonds.courses else None or 'thinkcspy' if x in thinkcspy.courses else None)\n",
  533. "course_ids['book'] = course_ids['course_name'].map(f)"
  534. ],
  535. "language": "python",
  536. "metadata": {},
  537. "outputs": [],
  538. "prompt_number": 62
  539. },
  540. {
  541. "cell_type": "code",
  542. "collapsed": false,
  543. "input": [
  544. "course_ids\n"
  545. ],
  546. "language": "python",
  547. "metadata": {},
  548. "outputs": [
  549. {
  550. "html": [
  551. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  552. "<table border=\"1\" class=\"dataframe\">\n",
  553. " <thead>\n",
  554. " <tr style=\"text-align: right;\">\n",
  555. " <th></th>\n",
  556. " <th>id</th>\n",
  557. " <th>course_id</th>\n",
  558. " <th>course_name</th>\n",
  559. " <th>term_start_date</th>\n",
  560. " <th>book</th>\n",
  561. " </tr>\n",
  562. " </thead>\n",
  563. " <tbody>\n",
  564. " <tr>\n",
  565. " <th>0 </th>\n",
  566. " <td> 144</td>\n",
  567. " <td> None</td>\n",
  568. " <td> CSC131tagliarini</td>\n",
  569. " <td> 2013-08-20</td>\n",
  570. " <td> thinkcspy</td>\n",
  571. " </tr>\n",
  572. " <tr>\n",
  573. " <th>1 </th>\n",
  574. " <td> 2</td>\n",
  575. " <td> pythonds</td>\n",
  576. " <td> pythonds</td>\n",
  577. " <td> 2011-01-01</td>\n",
  578. " <td> pythonds</td>\n",
  579. " </tr>\n",
  580. " <tr>\n",
  581. " <th>2 </th>\n",
  582. " <td> 3</td>\n",
  583. " <td> thinkcspy</td>\n",
  584. " <td> thinkcspy</td>\n",
  585. " <td> 2011-01-01</td>\n",
  586. " <td> thinkcspy</td>\n",
  587. " </tr>\n",
  588. " <tr>\n",
  589. " <th>3 </th>\n",
  590. " <td> 139</td>\n",
  591. " <td> None</td>\n",
  592. " <td> HPComSci_IB</td>\n",
  593. " <td> 2013-08-17</td>\n",
  594. " <td> thinkcspy</td>\n",
  595. " </tr>\n",
  596. " <tr>\n",
  597. " <th>4 </th>\n",
  598. " <td> 116</td>\n",
  599. " <td> None</td>\n",
  600. " <td> a</td>\n",
  601. " <td> 2013-08-07</td>\n",
  602. " <td> None</td>\n",
  603. " </tr>\n",
  604. " <tr>\n",
  605. " <th>5 </th>\n",
  606. " <td> 6</td>\n",
  607. " <td> luther151</td>\n",
  608. " <td> luther151</td>\n",
  609. " <td> 2011-01-01</td>\n",
  610. " <td> pythonds</td>\n",
  611. " </tr>\n",
  612. " <tr>\n",
  613. " <th>6 </th>\n",
  614. " <td> 8</td>\n",
  615. " <td> python4bio</td>\n",
  616. " <td> python4bio</td>\n",
  617. " <td> 2011-01-01</td>\n",
  618. " <td> thinkcspy</td>\n",
  619. " </tr>\n",
  620. " <tr>\n",
  621. " <th>7 </th>\n",
  622. " <td> 88</td>\n",
  623. " <td> Python 101</td>\n",
  624. " <td> Python 101</td>\n",
  625. " <td> 2011-01-01</td>\n",
  626. " <td> None</td>\n",
  627. " </tr>\n",
  628. " <tr>\n",
  629. " <th>8 </th>\n",
  630. " <td> 9</td>\n",
  631. " <td> BeensPy</td>\n",
  632. " <td> BeensPy</td>\n",
  633. " <td> 2011-01-01</td>\n",
  634. " <td> thinkcspy</td>\n",
  635. " </tr>\n",
  636. " <tr>\n",
  637. " <th>9 </th>\n",
  638. " <td> 10</td>\n",
  639. " <td> pyocc</td>\n",
  640. " <td> pyocc</td>\n",
  641. " <td> 2011-01-01</td>\n",
  642. " <td> thinkcspy</td>\n",
  643. " </tr>\n",
  644. " <tr>\n",
  645. " <th>10 </th>\n",
  646. " <td> 11</td>\n",
  647. " <td> bet_test</td>\n",
  648. " <td> bet_test</td>\n",
  649. " <td> 2011-01-01</td>\n",
  650. " <td> thinkcspy</td>\n",
  651. " </tr>\n",
  652. " <tr>\n",
  653. " <th>11 </th>\n",
  654. " <td> 117</td>\n",
  655. " <td> None</td>\n",
  656. " <td> Moffdawg</td>\n",
  657. " <td> 2014-02-01</td>\n",
  658. " <td> thinkcspy</td>\n",
  659. " </tr>\n",
  660. " <tr>\n",
  661. " <th>12 </th>\n",
  662. " <td> 119</td>\n",
  663. " <td> None</td>\n",
  664. " <td> web2py</td>\n",
  665. " <td> 2013-08-12</td>\n",
  666. " <td> None</td>\n",
  667. " </tr>\n",
  668. " <tr>\n",
  669. " <th>13 </th>\n",
  670. " <td> 7</td>\n",
  671. " <td> COSC122</td>\n",
  672. " <td> COSC122</td>\n",
  673. " <td> 2013-07-07</td>\n",
  674. " <td> pythonds</td>\n",
  675. " </tr>\n",
  676. " <tr>\n",
  677. " <th>14 </th>\n",
  678. " <td> 118</td>\n",
  679. " <td> None</td>\n",
  680. " <td> STEM 10th Grade Computer Programming</td>\n",
  681. " <td> 2013-08-07</td>\n",
  682. " <td> None</td>\n",
  683. " </tr>\n",
  684. " <tr>\n",
  685. " <th>15 </th>\n",
  686. " <td> 13</td>\n",
  687. " <td> CS 115</td>\n",
  688. " <td> CS 115</td>\n",
  689. " <td> 2011-01-01</td>\n",
  690. " <td> None</td>\n",
  691. " </tr>\n",
  692. " <tr>\n",
  693. " <th>16 </th>\n",
  694. " <td> 14</td>\n",
  695. " <td> CS115</td>\n",
  696. " <td> CS115</td>\n",
  697. " <td> 2011-01-01</td>\n",
  698. " <td> thinkcspy</td>\n",
  699. " </tr>\n",
  700. " <tr>\n",
  701. " <th>17 </th>\n",
  702. " <td> 15</td>\n",
  703. " <td> NorKam Programming With Python</td>\n",
  704. " <td> NorKam Programming With Python</td>\n",
  705. " <td> 2011-01-01</td>\n",
  706. " <td> None</td>\n",
  707. " </tr>\n",
  708. " <tr>\n",
  709. " <th>18 </th>\n",
  710. " <td> 4</td>\n",
  711. " <td> gatech</td>\n",
  712. " <td> gatech</td>\n",
  713. " <td> 2013-08-09</td>\n",
  714. " <td> None</td>\n",
  715. " </tr>\n",
  716. " <tr>\n",
  717. " <th>19 </th>\n",
  718. " <td> 120</td>\n",
  719. " <td> None</td>\n",
  720. " <td> Web2py sem Segredos</td>\n",
  721. " <td> 2013-08-10</td>\n",
  722. " <td> None</td>\n",
  723. " </tr>\n",
  724. " <tr>\n",
  725. " <th>20 </th>\n",
  726. " <td> 121</td>\n",
  727. " <td> None</td>\n",
  728. " <td> Prj1</td>\n",
  729. " <td> 2013-08-10</td>\n",
  730. " <td> None</td>\n",
  731. " </tr>\n",
  732. " <tr>\n",
  733. " <th>21 </th>\n",
  734. " <td> 122</td>\n",
  735. " <td> None</td>\n",
  736. " <td> test</td>\n",
  737. " <td> 2013-08-10</td>\n",
  738. " <td> None</td>\n",
  739. " </tr>\n",
  740. " <tr>\n",
  741. " <th>22 </th>\n",
  742. " <td> 124</td>\n",
  743. " <td> None</td>\n",
  744. " <td> green</td>\n",
  745. " <td> 2013-08-12</td>\n",
  746. " <td> thinkcspy</td>\n",
  747. " </tr>\n",
  748. " <tr>\n",
  749. " <th>23 </th>\n",
  750. " <td> 16</td>\n",
  751. " <td> wm_cs20</td>\n",
  752. " <td> wm_cs20</td>\n",
  753. " <td> 2011-01-01</td>\n",
  754. " <td> None</td>\n",
  755. " </tr>\n",
  756. " <tr>\n",
  757. " <th>24 </th>\n",
  758. " <td> 151</td>\n",
  759. " <td> None</td>\n",
  760. " <td> algopython</td>\n",
  761. " <td> 2013-08-23</td>\n",
  762. " <td> None</td>\n",
  763. " </tr>\n",
  764. " <tr>\n",
  765. " <th>25 </th>\n",
  766. " <td> 125</td>\n",
  767. " <td> None</td>\n",
  768. " <td> AppliedGenomeInformatics</td>\n",
  769. " <td> 2014-09-01</td>\n",
  770. " <td> None</td>\n",
  771. " </tr>\n",
  772. " <tr>\n",
  773. " <th>26 </th>\n",
  774. " <td> 126</td>\n",
  775. " <td> None</td>\n",
  776. " <td> AppliedGenomeInformatics2</td>\n",
  777. " <td> 2014-09-01</td>\n",
  778. " <td> None</td>\n",
  779. " </tr>\n",
  780. " <tr>\n",
  781. " <th>27 </th>\n",
  782. " <td> 17</td>\n",
  783. " <td> craig-trial</td>\n",
  784. " <td> craig-trial</td>\n",
  785. " <td> 2011-01-01</td>\n",
  786. " <td> None</td>\n",
  787. " </tr>\n",
  788. " <tr>\n",
  789. " <th>28 </th>\n",
  790. " <td> 152</td>\n",
  791. " <td> None</td>\n",
  792. " <td> none</td>\n",
  793. " <td> 2013-08-23</td>\n",
  794. " <td> None</td>\n",
  795. " </tr>\n",
  796. " <tr>\n",
  797. " <th>29 </th>\n",
  798. " <td> 128</td>\n",
  799. " <td> None</td>\n",
  800. " <td> ISTA130_FALL_2013</td>\n",
  801. " <td> 2013-08-26</td>\n",
  802. " <td> thinkcspy</td>\n",
  803. " </tr>\n",
  804. " <tr>\n",
  805. " <th>...</th>\n",
  806. " <td>...</td>\n",
  807. " <td>...</td>\n",
  808. " <td>...</td>\n",
  809. " <td>...</td>\n",
  810. " <td>...</td>\n",
  811. " </tr>\n",
  812. " <tr>\n",
  813. " <th>380</th>\n",
  814. " <td> 334</td>\n",
  815. " <td> None</td>\n",
  816. " <td> Spring2014CS104</td>\n",
  817. " <td> 2014-02-01</td>\n",
  818. " <td> thinkcspy</td>\n",
  819. " </tr>\n",
  820. " <tr>\n",
  821. " <th>381</th>\n",
  822. " <td> 470</td>\n",
  823. " <td> None</td>\n",
  824. " <td> Choice</td>\n",
  825. " <td> 2014-06-02</td>\n",
  826. " <td> None</td>\n",
  827. " </tr>\n",
  828. " <tr>\n",
  829. " <th>382</th>\n",
  830. " <td> 499</td>\n",
  831. " <td> None</td>\n",
  832. " <td> sci30x</td>\n",
  833. " <td> 2014-06-19</td>\n",
  834. " <td> None</td>\n",
  835. " </tr>\n",
  836. " <tr>\n",
  837. " <th>383</th>\n",
  838. " <td> 500</td>\n",
  839. " <td> None</td>\n",
  840. " <td> sci30x2</td>\n",
  841. " <td> 2014-12-11</td>\n",
  842. " <td> None</td>\n",
  843. " </tr>\n",
  844. " <tr>\n",
  845. " <th>384</th>\n",
  846. " <td> 501</td>\n",
  847. " <td> None</td>\n",
  848. " <td> Decidability</td>\n",
  849. " <td> 2014-06-19</td>\n",
  850. " <td> None</td>\n",
  851. " </tr>\n",
  852. " <tr>\n",
  853. " <th>385</th>\n",
  854. " <td> 471</td>\n",
  855. " <td> None</td>\n",
  856. " <td> choice</td>\n",
  857. " <td> 2014-05-27</td>\n",
  858. " <td> None</td>\n",
  859. " </tr>\n",
  860. " <tr>\n",
  861. " <th>386</th>\n",
  862. " <td> 475</td>\n",
  863. " <td> None</td>\n",
  864. " <td> bnmtest91</td>\n",
  865. " <td> 2014-05-28</td>\n",
  866. " <td> None</td>\n",
  867. " </tr>\n",
  868. " <tr>\n",
  869. " <th>387</th>\n",
  870. " <td> 477</td>\n",
  871. " <td> None</td>\n",
  872. " <td> bnmtest92</td>\n",
  873. " <td> 2014-05-28</td>\n",
  874. " <td> None</td>\n",
  875. " </tr>\n",
  876. " <tr>\n",
  877. " <th>388</th>\n",
  878. " <td> 476</td>\n",
  879. " <td> None</td>\n",
  880. " <td> CS104Fall2014</td>\n",
  881. " <td> 2014-09-02</td>\n",
  882. " <td> None</td>\n",
  883. " </tr>\n",
  884. " <tr>\n",
  885. " <th>389</th>\n",
  886. " <td> 478</td>\n",
  887. " <td> None</td>\n",
  888. " <td> CCPS_Python</td>\n",
  889. " <td> 2014-05-29</td>\n",
  890. " <td> None</td>\n",
  891. " </tr>\n",
  892. " <tr>\n",
  893. " <th>390</th>\n",
  894. " <td> 479</td>\n",
  895. " <td> None</td>\n",
  896. " <td> ru_python</td>\n",
  897. " <td> 2014-06-01</td>\n",
  898. " <td> None</td>\n",
  899. " </tr>\n",
  900. " <tr>\n",
  901. " <th>391</th>\n",
  902. " <td> 481</td>\n",
  903. " <td> None</td>\n",
  904. " <td> TestCourse131234</td>\n",
  905. " <td> 2014-06-04</td>\n",
  906. " <td> None</td>\n",
  907. " </tr>\n",
  908. " <tr>\n",
  909. " <th>392</th>\n",
  910. " <td> 482</td>\n",
  911. " <td> None</td>\n",
  912. " <td> TestCourse131235</td>\n",
  913. " <td> 2014-06-04</td>\n",
  914. " <td> None</td>\n",
  915. " </tr>\n",
  916. " <tr>\n",
  917. " <th>393</th>\n",
  918. " <td> 483</td>\n",
  919. " <td> None</td>\n",
  920. " <td> TestCourse131236</td>\n",
  921. " <td> 2014-06-04</td>\n",
  922. " <td> None</td>\n",
  923. " </tr>\n",
  924. " <tr>\n",
  925. " <th>394</th>\n",
  926. " <td> 484</td>\n",
  927. " <td> None</td>\n",
  928. " <td> NYU-CSCI-UA2-005</td>\n",
  929. " <td> 2014-09-01</td>\n",
  930. " <td> None</td>\n",
  931. " </tr>\n",
  932. " <tr>\n",
  933. " <th>395</th>\n",
  934. " <td> 485</td>\n",
  935. " <td> None</td>\n",
  936. " <td> TestCourse131237</td>\n",
  937. " <td> 2014-06-04</td>\n",
  938. " <td> None</td>\n",
  939. " </tr>\n",
  940. " <tr>\n",
  941. " <th>396</th>\n",
  942. " <td> 486</td>\n",
  943. " <td> None</td>\n",
  944. " <td> lbcc-cs22</td>\n",
  945. " <td> 2014-06-05</td>\n",
  946. " <td> None</td>\n",
  947. " </tr>\n",
  948. " <tr>\n",
  949. " <th>397</th>\n",
  950. " <td> 487</td>\n",
  951. " <td> None</td>\n",
  952. " <td> WormsleyTest</td>\n",
  953. " <td> 2014-06-05</td>\n",
  954. " <td> None</td>\n",
  955. " </tr>\n",
  956. " <tr>\n",
  957. " <th>398</th>\n",
  958. " <td> 488</td>\n",
  959. " <td> None</td>\n",
  960. " <td> Programming1</td>\n",
  961. " <td> 2014-06-06</td>\n",
  962. " <td> None</td>\n",
  963. " </tr>\n",
  964. " <tr>\n",
  965. " <th>399</th>\n",
  966. " <td> 489</td>\n",
  967. " <td> None</td>\n",
  968. " <td> cchspython</td>\n",
  969. " <td> 2014-06-06</td>\n",
  970. " <td> None</td>\n",
  971. " </tr>\n",
  972. " <tr>\n",
  973. " <th>400</th>\n",
  974. " <td> 472</td>\n",
  975. " <td> None</td>\n",
  976. " <td> ChoiceCoding</td>\n",
  977. " <td> 2014-05-27</td>\n",
  978. " <td> None</td>\n",
  979. " </tr>\n",
  980. " <tr>\n",
  981. " <th>401</th>\n",
  982. " <td> 490</td>\n",
  983. " <td> None</td>\n",
  984. " <td> FirenzePasA042</td>\n",
  985. " <td> 2014-06-11</td>\n",
  986. " <td> None</td>\n",
  987. " </tr>\n",
  988. " <tr>\n",
  989. " <th>402</th>\n",
  990. " <td> 491</td>\n",
  991. " <td> None</td>\n",
  992. " <td> tp-Python</td>\n",
  993. " <td> 2014-06-12</td>\n",
  994. " <td> None</td>\n",
  995. " </tr>\n",
  996. " <tr>\n",
  997. " <th>403</th>\n",
  998. " <td> 492</td>\n",
  999. " <td> None</td>\n",
  1000. " <td> Univ-of-Wash-CSE-373-Autumn-2014</td>\n",
  1001. " <td> 2014-09-24</td>\n",
  1002. " <td> None</td>\n",
  1003. " </tr>\n",
  1004. " <tr>\n",
  1005. " <th>404</th>\n",
  1006. " <td> 493</td>\n",
  1007. " <td> None</td>\n",
  1008. " <td> Ordinamentovettori</td>\n",
  1009. " <td> 2014-06-16</td>\n",
  1010. " <td> None</td>\n",
  1011. " </tr>\n",
  1012. " <tr>\n",
  1013. " <th>405</th>\n",
  1014. " <td> 494</td>\n",
  1015. " <td> None</td>\n",
  1016. " <td> CS902SU14</td>\n",
  1017. " <td> 2014-06-16</td>\n",
  1018. " <td> None</td>\n",
  1019. " </tr>\n",
  1020. " <tr>\n",
  1021. " <th>406</th>\n",
  1022. " <td> 495</td>\n",
  1023. " <td> None</td>\n",
  1024. " <td> 123</td>\n",
  1025. " <td> 2014-06-16</td>\n",
  1026. " <td> None</td>\n",
  1027. " </tr>\n",
  1028. " <tr>\n",
  1029. " <th>407</th>\n",
  1030. " <td> 503</td>\n",
  1031. " <td> None</td>\n",
  1032. " <td> ICS3U</td>\n",
  1033. " <td> 2014-06-18</td>\n",
  1034. " <td> None</td>\n",
  1035. " </tr>\n",
  1036. " <tr>\n",
  1037. " <th>408</th>\n",
  1038. " <td> 504</td>\n",
  1039. " <td> None</td>\n",
  1040. " <td> Learn_Pythom</td>\n",
  1041. " <td> 2014-06-20</td>\n",
  1042. " <td> None</td>\n",
  1043. " </tr>\n",
  1044. " <tr>\n",
  1045. " <th>409</th>\n",
  1046. " <td> 507</td>\n",
  1047. " <td> None</td>\n",
  1048. " <td> WestlakeCSP</td>\n",
  1049. " <td> 2014-06-22</td>\n",
  1050. " <td> None</td>\n",
  1051. " </tr>\n",
  1052. " </tbody>\n",
  1053. "</table>\n",
  1054. "<p>410 rows \u00d7 5 columns</p>\n",
  1055. "</div>"
  1056. ],
  1057. "metadata": {},
  1058. "output_type": "pyout",
  1059. "prompt_number": 63,
  1060. "text": [
  1061. " id course_id \\\n",
  1062. "0 144 None \n",
  1063. "1 2 pythonds \n",
  1064. "2 3 thinkcspy \n",
  1065. "3 139 None \n",
  1066. "4 116 None \n",
  1067. "5 6 luther151 \n",
  1068. "6 8 python4bio \n",
  1069. "7 88 Python 101 \n",
  1070. "8 9 BeensPy \n",
  1071. "9 10 pyocc \n",
  1072. "10 11 bet_test \n",
  1073. "11 117 None \n",
  1074. "12 119 None \n",
  1075. "13 7 COSC122 \n",
  1076. "14 118 None \n",
  1077. "15 13 CS 115 \n",
  1078. "16 14 CS115 \n",
  1079. "17 15 NorKam Programming With Python \n",
  1080. "18 4 gatech \n",
  1081. "19 120 None \n",
  1082. "20 121 None \n",
  1083. "21 122 None \n",
  1084. "22 124 None \n",
  1085. "23 16 wm_cs20 \n",
  1086. "24 151 None \n",
  1087. "25 125 None \n",
  1088. "26 126 None \n",
  1089. "27 17 craig-trial \n",
  1090. "28 152 None \n",
  1091. "29 128 None \n",
  1092. ".. ... ... \n",
  1093. "380 334 None \n",
  1094. "381 470 None \n",
  1095. "382 499 None \n",
  1096. "383 500 None \n",
  1097. "384 501 None \n",
  1098. "385 471 None \n",
  1099. "386 475 None \n",
  1100. "387 477 None \n",
  1101. "388 476 None \n",
  1102. "389 478 None \n",
  1103. "390 479 None \n",
  1104. "391 481 None \n",
  1105. "392 482 None \n",
  1106. "393 483 None \n",
  1107. "394 484 None \n",
  1108. "395 485 None \n",
  1109. "396 486 None \n",
  1110. "397 487 None \n",
  1111. "398 488 None \n",
  1112. "399 489 None \n",
  1113. "400 472 None \n",
  1114. "401 490 None \n",
  1115. "402 491 None \n",
  1116. "403 492 None \n",
  1117. "404 493 None \n",
  1118. "405 494 None \n",
  1119. "406 495 None \n",
  1120. "407 503 None \n",
  1121. "408 504 None \n",
  1122. "409 507 None \n",
  1123. "\n",
  1124. " course_name term_start_date book \n",
  1125. "0 CSC131tagliarini 2013-08-20 thinkcspy \n",
  1126. "1 pythonds 2011-01-01 pythonds \n",
  1127. "2 thinkcspy 2011-01-01 thinkcspy \n",
  1128. "3 HPComSci_IB 2013-08-17 thinkcspy \n",
  1129. "4 a 2013-08-07 None \n",
  1130. "5 luther151 2011-01-01 pythonds \n",
  1131. "6 python4bio 2011-01-01 thinkcspy \n",
  1132. "7 Python 101 2011-01-01 None \n",
  1133. "8 BeensPy 2011-01-01 thinkcspy \n",
  1134. "9 pyocc 2011-01-01 thinkcspy \n",
  1135. "10 bet_test 2011-01-01 thinkcspy \n",
  1136. "11 Moffdawg 2014-02-01 thinkcspy \n",
  1137. "12 web2py 2013-08-12 None \n",
  1138. "13 COSC122 2013-07-07 pythonds \n",
  1139. "14 STEM 10th Grade Computer Programming 2013-08-07 None \n",
  1140. "15 CS 115 2011-01-01 None \n",
  1141. "16 CS115 2011-01-01 thinkcspy \n",
  1142. "17 NorKam Programming With Python 2011-01-01 None \n",
  1143. "18 gatech 2013-08-09 None \n",
  1144. "19 Web2py sem Segredos 2013-08-10 None \n",
  1145. "20 Prj1 2013-08-10 None \n",
  1146. "21 test 2013-08-10 None \n",
  1147. "22 green 2013-08-12 thinkcspy \n",
  1148. "23 wm_cs20 2011-01-01 None \n",
  1149. "24 algopython 2013-08-23 None \n",
  1150. "25 AppliedGenomeInformatics 2014-09-01 None \n",
  1151. "26 AppliedGenomeInformatics2 2014-09-01 None \n",
  1152. "27 craig-trial 2011-01-01 None \n",
  1153. "28 none 2013-08-23 None \n",
  1154. "29 ISTA130_FALL_2013 2013-08-26 thinkcspy \n",
  1155. ".. ... ... ... \n",
  1156. "380 Spring2014CS104 2014-02-01 thinkcspy \n",
  1157. "381 Choice 2014-06-02 None \n",
  1158. "382 sci30x 2014-06-19 None \n",
  1159. "383 sci30x2 2014-12-11 None \n",
  1160. "384 Decidability 2014-06-19 None \n",
  1161. "385 choice 2014-05-27 None \n",
  1162. "386 bnmtest91 2014-05-28 None \n",
  1163. "387 bnmtest92 2014-05-28 None \n",
  1164. "388 CS104Fall2014 2014-09-02 None \n",
  1165. "389 CCPS_Python 2014-05-29 None \n",
  1166. "390 ru_python 2014-06-01 None \n",
  1167. "391 TestCourse131234 2014-06-04 None \n",
  1168. "392 TestCourse131235 2014-06-04 None \n",
  1169. "393 TestCourse131236 2014-06-04 None \n",
  1170. "394 NYU-CSCI-UA2-005 2014-09-01 None \n",
  1171. "395 TestCourse131237 2014-06-04 None \n",
  1172. "396 lbcc-cs22 2014-06-05 None \n",
  1173. "397 WormsleyTest 2014-06-05 None \n",
  1174. "398 Programming1 2014-06-06 None \n",
  1175. "399 cchspython 2014-06-06 None \n",
  1176. "400 ChoiceCoding 2014-05-27 None \n",
  1177. "401 FirenzePasA042 2014-06-11 None \n",
  1178. "402 tp-Python 2014-06-12 None \n",
  1179. "403 Univ-of-Wash-CSE-373-Autumn-2014 2014-09-24 None \n",
  1180. "404 Ordinamentovettori 2014-06-16 None \n",
  1181. "405 CS902SU14 2014-06-16 None \n",
  1182. "406 123 2014-06-16 None \n",
  1183. "407 ICS3U 2014-06-18 None \n",
  1184. "408 Learn_Pythom 2014-06-20 None \n",
  1185. "409 WestlakeCSP 2014-06-22 None \n",
  1186. "\n",
  1187. "[410 rows x 5 columns]"
  1188. ]
  1189. }
  1190. ],
  1191. "prompt_number": 63
  1192. },
  1193. {
  1194. "cell_type": "code",
  1195. "collapsed": false,
  1196. "input": [
  1197. "course_ids.to_sql('bookcourses',engine2,if_exists='replace')"
  1198. ],
  1199. "language": "python",
  1200. "metadata": {},
  1201. "outputs": [],
  1202. "prompt_number": 64
  1203. },
  1204. {
  1205. "cell_type": "heading",
  1206. "level": 6,
  1207. "metadata": {},
  1208. "source": [
  1209. "Removal starts here"
  1210. ]
  1211. },
  1212. {
  1213. "cell_type": "code",
  1214. "collapsed": false,
  1215. "input": [
  1216. "%%sql\n",
  1217. "CREATE INDEX bookIndex\n",
  1218. "ON bookcourses (index);\n",
  1219. "\n",
  1220. "COMMIT;"
  1221. ],
  1222. "language": "python",
  1223. "metadata": {},
  1224. "outputs": [
  1225. {
  1226. "output_type": "stream",
  1227. "stream": "stdout",
  1228. "text": [
  1229. "Done.\n",
  1230. "Done.\n"
  1231. ]
  1232. },
  1233. {
  1234. "metadata": {},
  1235. "output_type": "pyout",
  1236. "prompt_number": 65,
  1237. "text": [
  1238. "[]"
  1239. ]
  1240. }
  1241. ],
  1242. "prompt_number": 65
  1243. },
  1244. {
  1245. "cell_type": "code",
  1246. "collapsed": false,
  1247. "input": [
  1248. "query = '''\n",
  1249. "SELECT book, course_id ,clicks,numDivs,numSids FROM\n",
  1250. "(\n",
  1251. "SELECT book, course_name\n",
  1252. "FROM bookcourses\n",
  1253. ") AS S\n",
  1254. "JOIN\n",
  1255. "(SELECT course_id, COUNT(*) AS clicks, COUNT(DISTINCT div_id) AS numDivs, COUNT(DISTINCT sid) AS numSids\n",
  1256. "FROM useinfo\n",
  1257. "GROUP BY course_id\n",
  1258. ") AS T\n",
  1259. "ON (S.course_name = T.course_id)\n",
  1260. "'''\n",
  1261. "\n",
  1262. "course_idInfo = psql.read_sql(query, conn)"
  1263. ],
  1264. "language": "python",
  1265. "metadata": {},
  1266. "outputs": [],
  1267. "prompt_number": 66
  1268. },
  1269. {
  1270. "cell_type": "code",
  1271. "collapsed": false,
  1272. "input": [
  1273. "#Collect all of the courses that don't have a book associated with them\n",
  1274. "noBook = course_idInfo[course_idInfo['book'] != 'pythonds']\n",
  1275. "noBook = noBook[noBook['book'] != 'thinkcspy']"
  1276. ],
  1277. "language": "python",
  1278. "metadata": {},
  1279. "outputs": [],
  1280. "prompt_number": 67
  1281. },
  1282. {
  1283. "cell_type": "code",
  1284. "collapsed": false,
  1285. "input": [
  1286. "df = course_idInfo\n",
  1287. "df1 = df[df['numdivs'] < 5]\n",
  1288. "df2 = df[df['numsids'] < 5]\n",
  1289. "df3 = df[df['clicks']<100]\n",
  1290. "df4 = pd.merge(df1,df2,how='outer')\n",
  1291. "df5 = pd.merge(df3,df4,how='outer')"
  1292. ],
  1293. "language": "python",
  1294. "metadata": {},
  1295. "outputs": [],
  1296. "prompt_number": 68
  1297. },
  1298. {
  1299. "cell_type": "code",
  1300. "collapsed": false,
  1301. "input": [
  1302. "remove = pd.merge(df5,noBook, how = 'outer')['course_id']\n"
  1303. ],
  1304. "language": "python",
  1305. "metadata": {},
  1306. "outputs": [],
  1307. "prompt_number": 69
  1308. },
  1309. {
  1310. "cell_type": "code",
  1311. "collapsed": false,
  1312. "input": [
  1313. "remove"
  1314. ],
  1315. "language": "python",
  1316. "metadata": {},
  1317. "outputs": [
  1318. {
  1319. "metadata": {},
  1320. "output_type": "pyout",
  1321. "prompt_number": 70,
  1322. "text": [
  1323. "0 9my0\n",
  1324. "1 ACcomp11\n",
  1325. "2 APCS-Python\n",
  1326. "3 AppliedGenomeInformatics2\n",
  1327. "4 BASE\n",
  1328. "5 BC_GCSECompSci\n",
  1329. "6 C152_Sum\n",
  1330. "7 CBCS\n",
  1331. "8 CS140\n",
  1332. "9 CSC100_J_term\n",
  1333. "10 CSIS152\n",
  1334. "11 CSIT902\n",
  1335. "12 CS_Python\n",
  1336. "13 CodingLevel1\n",
  1337. "14 DAM_00\n",
  1338. "...\n",
  1339. "75 septtest1\n",
  1340. "76 stolaf121AS14\n",
  1341. "77 tc_1\n",
  1342. "78 test_tssi\n",
  1343. "79 testcamp\n",
  1344. "80 testproject\n",
  1345. "81 IntroCC\n",
  1346. "82 MrEasonECSFall2013\n",
  1347. "83 NZTeach\n",
  1348. "84 PythonASMSA\n",
  1349. "85 SKECS2012PythonI\n",
  1350. "86 WPSCS\n",
  1351. "87 JavaReview\n",
  1352. "88 Learning\n",
  1353. "89 SER101F2013\n",
  1354. "Name: course_id, Length: 90, dtype: object"
  1355. ]
  1356. }
  1357. ],
  1358. "prompt_number": 70
  1359. },
  1360. {
  1361. "cell_type": "code",
  1362. "collapsed": false,
  1363. "input": [
  1364. "#This is for the clean database notebook. After making a table of the removed items, I'll remove those course_ids from useinfo \n",
  1365. "remove.to_sql('outlier_courses',engine2,if_exists='replace')"
  1366. ],
  1367. "language": "python",
  1368. "metadata": {},
  1369. "outputs": [],
  1370. "prompt_number": 71
  1371. },
  1372. {
  1373. "cell_type": "code",
  1374. "collapsed": false,
  1375. "input": [
  1376. "%%sql\n",
  1377. "CREATE INDEX ourlier_course_Index\n",
  1378. "ON outlier_courses (index);\n",
  1379. "\n",
  1380. "COMMIT;"
  1381. ],
  1382. "language": "python",
  1383. "metadata": {},
  1384. "outputs": [
  1385. {
  1386. "output_type": "stream",
  1387. "stream": "stdout",
  1388. "text": [
  1389. "Done.\n",
  1390. "Done.\n"
  1391. ]
  1392. },
  1393. {
  1394. "metadata": {},
  1395. "output_type": "pyout",
  1396. "prompt_number": 72,
  1397. "text": [
  1398. "[]"
  1399. ]
  1400. }
  1401. ],
  1402. "prompt_number": 72
  1403. },
  1404. {
  1405. "cell_type": "code",
  1406. "collapsed": false,
  1407. "input": [
  1408. "%%sql\n",
  1409. "DELETE FROM useinfo\n",
  1410. "WHERE course_id IN \n",
  1411. "(SELECT course_id FROM outlier_courses);\n",
  1412. "COMMIT;"
  1413. ],
  1414. "language": "python",
  1415. "metadata": {},
  1416. "outputs": [
  1417. {
  1418. "output_type": "stream",
  1419. "stream": "stdout",
  1420. "text": [
  1421. "24827 rows affected.\n",
  1422. "Done.\n"
  1423. ]
  1424. },
  1425. {
  1426. "metadata": {},
  1427. "output_type": "pyout",
  1428. "prompt_number": 73,
  1429. "text": [
  1430. "[]"
  1431. ]
  1432. }
  1433. ],
  1434. "prompt_number": 73
  1435. },
  1436. {
  1437. "cell_type": "heading",
  1438. "level": 3,
  1439. "metadata": {},
  1440. "source": [
  1441. "Delete Single Div Users from the database"
  1442. ]
  1443. },
  1444. {
  1445. "cell_type": "heading",
  1446. "level": 5,
  1447. "metadata": {},
  1448. "source": [
  1449. "Pythonds"
  1450. ]
  1451. },
  1452. {
  1453. "cell_type": "code",
  1454. "collapsed": false,
  1455. "input": [
  1456. "%%sql\n",
  1457. "DELETE FROM useinfo WHERE id IN\n",
  1458. "(\n",
  1459. "SELECT useinfo.id FROM useinfo JOIN bookcourses\n",
  1460. " ON (useinfo.course_id = bookcourses.course_name)\n",
  1461. "WHERE sid IN\n",
  1462. "(\n",
  1463. "SELECT sid\n",
  1464. "FROM \n",
  1465. "(\n",
  1466. " SELECT DISTINCT sid,div_id\n",
  1467. " FROM useinfo JOIN bookcourses\n",
  1468. " ON (useinfo.course_id = bookcourses.course_name)\n",
  1469. " WHERE book = 'pythonds'\n",
  1470. ") AS foo\n",
  1471. "GROUP BY sid HAVING count(*) = 1\n",
  1472. ")\n",
  1473. "AND book = 'pythonds');\n",
  1474. "\n",
  1475. "COMMIT;"
  1476. ],
  1477. "language": "python",
  1478. "metadata": {},
  1479. "outputs": [
  1480. {
  1481. "output_type": "stream",
  1482. "stream": "stdout",
  1483. "text": [
  1484. "432951 rows affected.\n",
  1485. "Done.\n"
  1486. ]
  1487. },
  1488. {
  1489. "metadata": {},
  1490. "output_type": "pyout",
  1491. "prompt_number": 74,
  1492. "text": [
  1493. "[]"
  1494. ]
  1495. }
  1496. ],
  1497. "prompt_number": 74
  1498. },
  1499. {
  1500. "cell_type": "heading",
  1501. "level": 5,
  1502. "metadata": {},
  1503. "source": [
  1504. "Thinkcspy"
  1505. ]
  1506. },
  1507. {
  1508. "cell_type": "code",
  1509. "collapsed": false,
  1510. "input": [
  1511. "%%sql\n",
  1512. "DELETE FROM useinfo WHERE id IN\n",
  1513. "(\n",
  1514. "SELECT useinfo.id FROM useinfo JOIN bookcourses\n",
  1515. " ON (useinfo.course_id = bookcourses.course_name)\n",
  1516. "WHERE sid IN\n",
  1517. "(\n",
  1518. "SELECT sid\n",
  1519. "FROM \n",
  1520. "(\n",
  1521. " SELECT DISTINCT sid,div_id\n",
  1522. " FROM useinfo JOIN bookcourses\n",
  1523. " ON (useinfo.course_id = bookcourses.course_name)\n",
  1524. " WHERE book = 'thinkcspy'\n",
  1525. ") AS foo\n",
  1526. "GROUP BY sid HAVING count(*) = 1\n",
  1527. ")\n",
  1528. "AND book = 'thinkcspy');\n",
  1529. "\n",
  1530. "COMMIT;"
  1531. ],
  1532. "language": "python",
  1533. "metadata": {},
  1534. "outputs": [
  1535. {
  1536. "output_type": "stream",
  1537. "stream": "stdout",
  1538. "text": [
  1539. "520677 rows affected.\n",
  1540. "Done.\n"
  1541. ]
  1542. },
  1543. {
  1544. "metadata": {},
  1545. "output_type": "pyout",
  1546. "prompt_number": 75,
  1547. "text": [
  1548. "[]"
  1549. ]
  1550. }
  1551. ],
  1552. "prompt_number": 75
  1553. },
  1554. {
  1555. "cell_type": "heading",
  1556. "level": 3,
  1557. "metadata": {},
  1558. "source": [
  1559. "Delete Short Term Users from the database"
  1560. ]
  1561. },
  1562. {
  1563. "cell_type": "code",
  1564. "collapsed": false,
  1565. "input": [
  1566. "%%sql\n",
  1567. "DELETE FROM useinfo\n",
  1568. "WHERE sid IN\n",
  1569. "(\n",
  1570. "--#Select all users whose duration on the website was less than an hour \n",
  1571. " SELECT sid FROM\n",
  1572. " \n",
  1573. " (\n",
  1574. " SELECT sid,timestamp,event,act,div_id,course_id \n",
  1575. " FROM \n",
  1576. " (\n",
  1577. " SELECT sid FROM useinfo\n",
  1578. " GROUP BY sid HAVING COUNT(*) > 1\n",
  1579. " ) AS T \n",
  1580. " NATURAL JOIN \n",
  1581. " (\n",
  1582. " SELECT sid,min(timestamp) startd, max(timestamp) endd FROM useinfo\n",
  1583. " GROUP BY sid\n",
  1584. " ) AS S \n",
  1585. " NATURAL JOIN \n",
  1586. " useinfo\n",
  1587. " WHERE (endd - startd) < ('1 day')::interval\n",
  1588. " ) AS mvu\n",
  1589. ")\n",
  1590. ";\n",
  1591. "COMMIT;\n",
  1592. " \n",
  1593. " \n"
  1594. ],
  1595. "language": "python",
  1596. "metadata": {},
  1597. "outputs": [
  1598. {
  1599. "output_type": "stream",
  1600. "stream": "stdout",
  1601. "text": [
  1602. "3164790 rows affected.\n",
  1603. "Done.\n"
  1604. ]
  1605. },
  1606. {
  1607. "metadata": {},
  1608. "output_type": "pyout",
  1609. "prompt_number": 76,
  1610. "text": [
  1611. "[]"
  1612. ]
  1613. }
  1614. ],
  1615. "prompt_number": 76
  1616. },
  1617. {
  1618. "cell_type": "code",
  1619. "collapsed": false,
  1620. "input": [
  1621. " %%sql\n",
  1622. "DELETE FROM useinfo\n",
  1623. "WHERE sid IN\n",
  1624. "(\n",
  1625. " SELECT sid FROM\n",
  1626. "--#Select all the users with only one action. \n",
  1627. " (\n",
  1628. " SELECT sid,timestamp,event,act,div_id,course_id FROM \n",
  1629. " (\n",
  1630. " SELECT sid FROM useinfo GROUP BY sid HAVING COUNT(*) < 2\n",
  1631. " ) AS R \n",
  1632. " NATURAL JOIN \n",
  1633. " useinfo\n",
  1634. " \n",
  1635. " ) AS svu\n",
  1636. " \n",
  1637. ")\n",
  1638. ";\n",
  1639. "\n",
  1640. "COMMIT;"
  1641. ],
  1642. "language": "python",
  1643. "metadata": {},
  1644. "outputs": [
  1645. {
  1646. "output_type": "stream",
  1647. "stream": "stdout",
  1648. "text": [
  1649. "10451 rows affected.\n",
  1650. "Done.\n"
  1651. ]
  1652. },
  1653. {
  1654. "metadata": {},
  1655. "output_type": "pyout",
  1656. "prompt_number": 77,
  1657. "text": [
  1658. "[]"
  1659. ]
  1660. }
  1661. ],
  1662. "prompt_number": 77
  1663. },
  1664. {
  1665. "cell_type": "code",
  1666. "collapsed": false,
  1667. "input": [],
  1668. "language": "python",
  1669. "metadata": {},
  1670. "outputs": [],
  1671. "prompt_number": 77
  1672. },
  1673. {
  1674. "cell_type": "heading",
  1675. "level": 3,
  1676. "metadata": {},
  1677. "source": [
  1678. "Remove outliers with respect to clicks and duration."
  1679. ]
  1680. },
  1681. {
  1682. "cell_type": "code",
  1683. "collapsed": false,
  1684. "input": [
  1685. "query = ''' \n",
  1686. "SELECT sid, min(timestamp) startd, max(timestamp) endd\n",
  1687. "FROM useinfo\n",
  1688. "GROUP BY sid\n",
  1689. "'''\n",
  1690. "\n",
  1691. "d = psql.frame_query(query,conn)\n",
  1692. "d.head()"
  1693. ],
  1694. "language": "python",
  1695. "metadata": {},
  1696. "outputs": [
  1697. {
  1698. "html": [
  1699. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  1700. "<table border=\"1\" class=\"dataframe\">\n",
  1701. " <thead>\n",
  1702. " <tr style=\"text-align: right;\">\n",
  1703. " <th></th>\n",
  1704. " <th>sid</th>\n",
  1705. " <th>startd</th>\n",
  1706. " <th>endd</th>\n",
  1707. " </tr>\n",
  1708. " </thead>\n",
  1709. " <tbody>\n",
  1710. " <tr>\n",
  1711. " <th>0</th>\n",
  1712. " <td> bcselvitazurkevica</td>\n",
  1713. " <td>2012-10-02 08:15:40</td>\n",
  1714. " <td>2014-02-11 21:10:49</td>\n",
  1715. " </tr>\n",
  1716. " <tr>\n",
  1717. " <th>1</th>\n",
  1718. " <td> jPayne</td>\n",
  1719. " <td>2013-09-10 15:43:59</td>\n",
  1720. " <td>2013-09-29 20:55:51</td>\n",
  1721. " </tr>\n",
  1722. " <tr>\n",
  1723. " <th>2</th>\n",
  1724. " <td> lisadawn</td>\n",
  1725. " <td>2013-08-21 21:21:58</td>\n",
  1726. " <td>2013-08-26 19:49:02</td>\n",
  1727. " </tr>\n",
  1728. " <tr>\n",
  1729. " <th>3</th>\n",
  1730. " <td> mackedee</td>\n",
  1731. " <td>2013-08-27 23:46:49</td>\n",
  1732. " <td>2013-09-10 01:17:01</td>\n",
  1733. " </tr>\n",
  1734. " <tr>\n",
  1735. " <th>4</th>\n",
  1736. " <td> +4dB</td>\n",
  1737. " <td>2013-07-14 01:20:04</td>\n",
  1738. " <td>2013-10-30 00:11:56</td>\n",
  1739. " </tr>\n",
  1740. " </tbody>\n",
  1741. "</table>\n",
  1742. "</div>"
  1743. ],
  1744. "metadata": {},
  1745. "output_type": "pyout",
  1746. "prompt_number": 78,
  1747. "text": [
  1748. " sid startd endd\n",
  1749. "0 bcselvitazurkevica 2012-10-02 08:15:40 2014-02-11 21:10:49\n",
  1750. "1 jPayne 2013-09-10 15:43:59 2013-09-29 20:55:51\n",
  1751. "2 lisadawn 2013-08-21 21:21:58 2013-08-26 19:49:02\n",
  1752. "3 mackedee 2013-08-27 23:46:49 2013-09-10 01:17:01\n",
  1753. "4 +4dB 2013-07-14 01:20:04 2013-10-30 00:11:56"
  1754. ]
  1755. }
  1756. ],
  1757. "prompt_number": 78
  1758. },
  1759. {
  1760. "cell_type": "code",
  1761. "collapsed": false,
  1762. "input": [
  1763. "d['duration'] = d['endd'] - d['startd']\n",
  1764. "d['durdays'] = d['duration'].map(lambda x: x.astype('timedelta64[D]')) #takes a function as a parameter and"
  1765. ],
  1766. "language": "python",
  1767. "metadata": {},
  1768. "outputs": [],
  1769. "prompt_number": 79
  1770. },
  1771. {
  1772. "cell_type": "code",
  1773. "collapsed": false,
  1774. "input": [
  1775. "df = d[d.duration > np.timedelta64(1,'D')]\n",
  1776. "#df = df[df.sid != 'bmiller']\n",
  1777. "#df = df[df.sid != 'ranum' ]\n",
  1778. "df.index = df.sid"
  1779. ],
  1780. "language": "python",
  1781. "metadata": {},
  1782. "outputs": [],
  1783. "prompt_number": 80
  1784. },
  1785. {
  1786. "cell_type": "code",
  1787. "collapsed": false,
  1788. "input": [
  1789. "q = '''\n",
  1790. "SELECT sid, COUNT(*) AS numvisits FROM useinfo\n",
  1791. "GROUP BY sid \n",
  1792. "ORDER BY numvisits\n",
  1793. "'''\n",
  1794. "visitf = psql.frame_query(q,conn)\n",
  1795. "visitf.index = visitf.sid"
  1796. ],
  1797. "language": "python",
  1798. "metadata": {},
  1799. "outputs": [],
  1800. "prompt_number": 81
  1801. },
  1802. {
  1803. "cell_type": "code",
  1804. "collapsed": false,
  1805. "input": [
  1806. "dfr = df['sid'].values\n",
  1807. "dfrs = set(dfr)"
  1808. ],
  1809. "language": "python",
  1810. "metadata": {},
  1811. "outputs": [],
  1812. "prompt_number": 82
  1813. },
  1814. {
  1815. "cell_type": "code",
  1816. "collapsed": false,
  1817. "input": [
  1818. "count = 1\n",
  1819. "pairs = []\n",
  1820. "for u in df.index:\n",
  1821. " if u in visitf.index:\n",
  1822. " count += 1\n",
  1823. " pairs.append((int(df.ix[u].durdays)/1000000000/(60*60*24),visitf.ix[u].numvisits, df.ix[u].sid ))"
  1824. ],
  1825. "language": "python",
  1826. "metadata": {},
  1827. "outputs": [],
  1828. "prompt_number": 83
  1829. },
  1830. {
  1831. "cell_type": "code",
  1832. "collapsed": false,
  1833. "input": [
  1834. "\n",
  1835. "pf = DataFrame( {'duration':[x[0] for x in pairs], 'clicks':[x[1] for x in pairs],'sid': [x[2] for x in pairs] } )\n"
  1836. ],
  1837. "language": "python",
  1838. "metadata": {},
  1839. "outputs": [],
  1840. "prompt_number": 84
  1841. },
  1842. {
  1843. "cell_type": "code",
  1844. "collapsed": false,
  1845. "input": [
  1846. "def getOutliers(dataframe, dimension):\n",
  1847. " mean = dataframe[dimension].mean()\n",
  1848. " std = dataframe[dimension].std() \n",
  1849. "\n",
  1850. " newDF = dataframe[(dataframe[dimension] - mean)/std < 3.5]\n",
  1851. " return newDF\n",
  1852. " print(mean, \" \" , std)"
  1853. ],
  1854. "language": "python",
  1855. "metadata": {},
  1856. "outputs": [],
  1857. "prompt_number": 85
  1858. },
  1859. {
  1860. "cell_type": "code",
  1861. "collapsed": false,
  1862. "input": [
  1863. "temp = getOutliers(pf, 'clicks') #Finds outliers with respects to clicks\n",
  1864. "temp1 = getOutliers(pf, 'duration') #Finds the rest of the outliers with respect to duration. "
  1865. ],
  1866. "language": "python",
  1867. "metadata": {},
  1868. "outputs": [],
  1869. "prompt_number": 86
  1870. },
  1871. {
  1872. "cell_type": "code",
  1873. "collapsed": false,
  1874. "input": [
  1875. "temp1.head()"
  1876. ],
  1877. "language": "python",
  1878. "metadata": {},
  1879. "outputs": [
  1880. {
  1881. "html": [
  1882. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  1883. "<table border=\"1\" class=\"dataframe\">\n",
  1884. " <thead>\n",
  1885. " <tr style=\"text-align: right;\">\n",
  1886. " <th></th>\n",
  1887. " <th>clicks</th>\n",
  1888. " <th>duration</th>\n",
  1889. " <th>sid</th>\n",
  1890. " </tr>\n",
  1891. " </thead>\n",
  1892. " <tbody>\n",
  1893. " <tr>\n",
  1894. " <th>1</th>\n",
  1895. " <td> 66</td>\n",
  1896. " <td> 19</td>\n",
  1897. " <td> jPayne</td>\n",
  1898. " </tr>\n",
  1899. " <tr>\n",
  1900. " <th>2</th>\n",
  1901. " <td> 544</td>\n",
  1902. " <td> 4</td>\n",
  1903. " <td> lisadawn</td>\n",
  1904. " </tr>\n",
  1905. " <tr>\n",
  1906. " <th>3</th>\n",
  1907. " <td> 209</td>\n",
  1908. " <td> 13</td>\n",
  1909. " <td> mackedee</td>\n",
  1910. " </tr>\n",
  1911. " <tr>\n",
  1912. " <th>4</th>\n",
  1913. " <td> 171</td>\n",
  1914. " <td> 107</td>\n",
  1915. " <td> +4dB</td>\n",
  1916. " </tr>\n",
  1917. " <tr>\n",
  1918. " <th>5</th>\n",
  1919. " <td> 1563</td>\n",
  1920. " <td> 111</td>\n",
  1921. " <td> 0v3r2</td>\n",
  1922. " </tr>\n",
  1923. " </tbody>\n",
  1924. "</table>\n",
  1925. "</div>"
  1926. ],
  1927. "metadata": {},
  1928. "output_type": "pyout",
  1929. "prompt_number": 87,
  1930. "text": [
  1931. " clicks duration sid\n",
  1932. "1 66 19 jPayne\n",
  1933. "2 544 4 lisadawn\n",
  1934. "3 209 13 mackedee\n",
  1935. "4 171 107 +4dB\n",
  1936. "5 1563 111 0v3r2"
  1937. ]
  1938. }
  1939. ],
  1940. "prompt_number": 87
  1941. },
  1942. {
  1943. "cell_type": "code",
  1944. "collapsed": false,
  1945. "input": [
  1946. "outliers = pd.merge(temp,temp1, how ='outer', on ='sid')\n",
  1947. "temp1.to_sql('outlierssid',engine2,if_exists='replace')"
  1948. ],
  1949. "language": "python",
  1950. "metadata": {},
  1951. "outputs": [],
  1952. "prompt_number": 88
  1953. },
  1954. {
  1955. "cell_type": "code",
  1956. "collapsed": false,
  1957. "input": [
  1958. "%%sql\n",
  1959. "CREATE INDEX outlierIndex\n",
  1960. "ON outlierssid (sid);\n",
  1961. "\n",
  1962. "COMMIT;"
  1963. ],
  1964. "language": "python",
  1965. "metadata": {},
  1966. "outputs": [
  1967. {
  1968. "output_type": "stream",
  1969. "stream": "stdout",
  1970. "text": [
  1971. "Done.\n",
  1972. "Done.\n"
  1973. ]
  1974. },
  1975. {
  1976. "metadata": {},
  1977. "output_type": "pyout",
  1978. "prompt_number": 89,
  1979. "text": [
  1980. "[]"
  1981. ]
  1982. }
  1983. ],
  1984. "prompt_number": 89
  1985. },
  1986. {
  1987. "cell_type": "code",
  1988. "collapsed": false,
  1989. "input": [
  1990. "%%sql\n",
  1991. "DELETE FROM useinfo\n",
  1992. "WHERE sid IN \n",
  1993. " (SELECT sid \n",
  1994. " FROM useinfo\n",
  1995. " NATURAL JOIN \n",
  1996. " (\n",
  1997. " SELECT sid \n",
  1998. " FROM useinfo\n",
  1999. " EXCEPT \n",
  2000. " (SELECT sid \n",
  2001. " FROM outlierssid\n",
  2002. " )\n",
  2003. " ) \n",
  2004. " AS T\n",
  2005. " );\n",
  2006. "COMMIT;"
  2007. ],
  2008. "language": "python",
  2009. "metadata": {},
  2010. "outputs": [
  2011. {
  2012. "output_type": "stream",
  2013. "stream": "stdout",
  2014. "text": [
  2015. "628996 rows affected.\n",
  2016. "Done.\n"
  2017. ]
  2018. },
  2019. {
  2020. "metadata": {},
  2021. "output_type": "pyout",
  2022. "prompt_number": 90,
  2023. "text": [
  2024. "[]"
  2025. ]
  2026. }
  2027. ],
  2028. "prompt_number": 90
  2029. },
  2030. {
  2031. "cell_type": "code",
  2032. "collapsed": false,
  2033. "input": [
  2034. "%%sql\n",
  2035. "SELECT count(*) FROM useinfo;"
  2036. ],
  2037. "language": "python",
  2038. "metadata": {},
  2039. "outputs": [
  2040. {
  2041. "output_type": "stream",
  2042. "stream": "stdout",
  2043. "text": [
  2044. "1 rows affected.\n"
  2045. ]
  2046. },
  2047. {
  2048. "html": [
  2049. "<table>\n",
  2050. " <tr>\n",
  2051. " <th>count</th>\n",
  2052. " </tr>\n",
  2053. " <tr>\n",
  2054. " <td>9357764</td>\n",
  2055. " </tr>\n",
  2056. "</table>"
  2057. ],
  2058. "metadata": {},
  2059. "output_type": "pyout",
  2060. "prompt_number": 91,
  2061. "text": [
  2062. "[(9357764,)]"
  2063. ]
  2064. }
  2065. ],
  2066. "prompt_number": 91
  2067. },
  2068. {
  2069. "cell_type": "heading",
  2070. "level": 3,
  2071. "metadata": {},
  2072. "source": [
  2073. "Replace courselib with runestone"
  2074. ]
  2075. },
  2076. {
  2077. "cell_type": "code",
  2078. "collapsed": false,
  2079. "input": [
  2080. "%%sql\n",
  2081. "UPDATE useinfo\n",
  2082. "SET div_id = regexp_replace(div_id,'^/courselib(.*)','/runestone\\1');\n",
  2083. "\n",
  2084. "COMMIT;"
  2085. ],
  2086. "language": "python",
  2087. "metadata": {},
  2088. "outputs": [
  2089. {
  2090. "output_type": "stream",
  2091. "stream": "stdout",
  2092. "text": [
  2093. "9357764 rows affected.\n",
  2094. "Done.\n"
  2095. ]
  2096. },
  2097. {
  2098. "metadata": {},
  2099. "output_type": "pyout",
  2100. "prompt_number": 92,
  2101. "text": [
  2102. "[]"
  2103. ]
  2104. }
  2105. ],
  2106. "prompt_number": 92
  2107. },
  2108. {
  2109. "cell_type": "heading",
  2110. "level": 3,
  2111. "metadata": {},
  2112. "source": [
  2113. "Replace mChoice with mchoice"
  2114. ]
  2115. },
  2116. {
  2117. "cell_type": "code",
  2118. "collapsed": false,
  2119. "input": [
  2120. "%%sql \n",
  2121. "UPDATE useinfo\n",
  2122. "SET event = 'mchoice'\n",
  2123. "WHERE event = 'mChoice';\n",
  2124. "\n",
  2125. "COMMIT;"
  2126. ],
  2127. "language": "python",
  2128. "metadata": {},
  2129. "outputs": [
  2130. {
  2131. "output_type": "stream",
  2132. "stream": "stdout",
  2133. "text": [
  2134. "1166988 rows affected.\n",
  2135. "Done.\n"
  2136. ]
  2137. },
  2138. {
  2139. "metadata": {},
  2140. "output_type": "pyout",
  2141. "prompt_number": 93,
  2142. "text": [
  2143. "[]"
  2144. ]
  2145. }
  2146. ],
  2147. "prompt_number": 93
  2148. },
  2149. {
  2150. "cell_type": "code",
  2151. "collapsed": false,
  2152. "input": [
  2153. "%%sql\n",
  2154. "SELECT * FROM useinfo\n",
  2155. "where event = 'mChoice';"
  2156. ],
  2157. "language": "python",
  2158. "metadata": {},
  2159. "outputs": [
  2160. {
  2161. "output_type": "stream",
  2162. "stream": "stdout",
  2163. "text": [
  2164. "0 rows affected.\n"
  2165. ]
  2166. },
  2167. {
  2168. "html": [
  2169. "<table>\n",
  2170. " <tr>\n",
  2171. " <th>id</th>\n",
  2172. " <th>timestamp</th>\n",
  2173. " <th>sid</th>\n",
  2174. " <th>event</th>\n",
  2175. " <th>act</th>\n",
  2176. " <th>div_id</th>\n",
  2177. " <th>course_id</th>\n",
  2178. " </tr>\n",
  2179. "</table>"
  2180. ],
  2181. "metadata": {},
  2182. "output_type": "pyout",
  2183. "prompt_number": 94,
  2184. "text": [
  2185. "[]"
  2186. ]
  2187. }
  2188. ],
  2189. "prompt_number": 94
  2190. },
  2191. {
  2192. "cell_type": "code",
  2193. "collapsed": false,
  2194. "input": [
  2195. "%%sql\n",
  2196. "SELECT count(*) FROM (\n",
  2197. "SELECT DISTINCT sid FROM useinfo) AS t;"
  2198. ],
  2199. "language": "python",
  2200. "metadata": {},
  2201. "outputs": [
  2202. {
  2203. "output_type": "stream",
  2204. "stream": "stdout",
  2205. "text": [
  2206. "1 rows affected.\n"
  2207. ]
  2208. },
  2209. {
  2210. "html": [
  2211. "<table>\n",
  2212. " <tr>\n",
  2213. " <th>count</th>\n",
  2214. " </tr>\n",
  2215. " <tr>\n",
  2216. " <td>81894</td>\n",
  2217. " </tr>\n",
  2218. "</table>"
  2219. ],
  2220. "metadata": {},
  2221. "output_type": "pyout",
  2222. "prompt_number": 95,
  2223. "text": [
  2224. "[(81894,)]"
  2225. ]
  2226. }
  2227. ],
  2228. "prompt_number": 95
  2229. },
  2230. {
  2231. "cell_type": "code",
  2232. "collapsed": false,
  2233. "input": [
  2234. "%%sql\n",
  2235. "SELECT count(*) FROM useinfo;"
  2236. ],
  2237. "language": "python",
  2238. "metadata": {},
  2239. "outputs": [
  2240. {
  2241. "output_type": "stream",
  2242. "stream": "stdout",
  2243. "text": [
  2244. "1 rows affected.\n"
  2245. ]
  2246. },
  2247. {
  2248. "html": [
  2249. "<table>\n",
  2250. " <tr>\n",
  2251. " <th>count</th>\n",
  2252. " </tr>\n",
  2253. " <tr>\n",
  2254. " <td>9357764</td>\n",
  2255. " </tr>\n",
  2256. "</table>"
  2257. ],
  2258. "metadata": {},
  2259. "output_type": "pyout",
  2260. "prompt_number": 96,
  2261. "text": [
  2262. "[(9357764,)]"
  2263. ]
  2264. }
  2265. ],
  2266. "prompt_number": 96
  2267. },
  2268. {
  2269. "cell_type": "code",
  2270. "collapsed": false,
  2271. "input": [],
  2272. "language": "python",
  2273. "metadata": {},
  2274. "outputs": [],
  2275. "prompt_number": 96
  2276. }
  2277. ],
  2278. "metadata": {}
  2279. }
  2280. ]
  2281. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement