Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "metadata": {
- "name": "",
- "signature": "sha256:962bc8e068d054a027f1195730bba2115cb820266aabc57ae123f73d5294650f"
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Data Cleanup Notebook\n",
- "\n",
- "Some thoughts before I get started with this. \n",
- " \n",
- " * Start by removing short term users. Recalculate for repeatability\n",
- " \n",
- " * Remove blank SIDs, instructors, and test courses (gatech,devcourse, and lightly used courses) \n",
- " \n",
- " * Remove nontraditional pages (those not begining with courselib or runestone \n",
- " \n",
- " * Remove unofficial div_ids\n",
- " \n",
- " * Remove outliers. Recalculate them for repeatability\n",
- " \n",
- " * Replace runestone with courselib\n",
- " \n",
- " * Replace mChoice with mchoice for consistency\n",
- " \n",
- " \n",
- "THINGS YOU NEED BEFORE RUNNING THIS NOTEBOOK:\n",
- "\n",
- "Tables:\n",
- " * useinfo (or something similar)\n",
- " * courses\n",
- " \n",
- "Python Scripts:\n",
- " * bookParse.py\n",
- " * getDivs.py\n"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "import getDivs\n",
- "import bookParse as bp\n",
- "from getInstructors import getInstructors\n",
- "import re\n",
- "import pandas as pd\n",
- "import psycopg2\n",
- "import socket\n",
- "import numpy as np\n",
- "import pandas.io.sql as psql\n",
- "from pandas import DataFrame\n",
- "from sqlalchemy import create_engine\n",
- "\n",
- "db_to_clean = 'bmiller'\n",
- "db_user = 'bmiller'\n",
- "db_pass = ''\n",
- "\n",
- "if db_pass:\n",
- " engine2 = create_engine('postgresql+psycopg2://%s:%s@localhost/%s' % (db_user, db_pass, db_to_clean))\n",
- " conn = psycopg2.connect(host=\"localhost\",database=db_to_clean,user=db_user,password=db_pass)\n",
- "else:\n",
- " engine2 = create_engine('postgresql+psycopg2://%s@localhost/%s' % (db_user, db_to_clean))\n",
- " conn = psycopg2.connect(host=\"localhost\",database=db_to_clean,user=db_user)\n",
- "\n",
- "connection2 = engine2.connect()\n",
- "\n",
- "\n",
- "%load_ext sql \n",
- "%sql postgresql://bmiller@localhost/bmiller"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "The sql extension is already loaded. To reload it, use:\n",
- " %reload_ext sql\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 47,
- "text": [
- "'Connected: bmiller@bmiller'"
- ]
- }
- ],
- "prompt_number": 47
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Remove non-traditional pages, blank sids, and instructors"
- ]
- },
- {
- "cell_type": "heading",
- "level": 6,
- "metadata": {},
- "source": [
- "Before running the following query, make sure to insert the most recent list of instructors."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE (event = 'page' AND NOT (div_id ~ '^/(courselib|runestone).*')) --#non-traditional pages\n",
- "OR sid = '' --#blank sids\n",
- "OR sid IN ('DaveCousins', '01418116', 'sbcmharrison', 'mrcpittway', 'paolobianconi', 'OrganizeFISH', 'brekke', 'haugrudm', 'fnurl', 'Martin Highmore', 'gisc', 'jstephens', 'pguse', 'klaidley', 'Nobula', 'ivipul', 'dgyanek', 'Nicholas Seward', 'asahny', 'chorse', 'rschulz', 'nedoluzhko', 'sara', 'memedu', 'swbcteach', 'bula', 'pbeens', 'wmp', 'potterwalt', 'walkermsum', 'acsizmadia', 'the.krup', 'the_bob_student', 'the_bob', 'jgreenawalt', 'Mcintoshchs', 'gkondrak', 'a.reid', 'mckeownp', 'timbellnz', 'quintin', 'rahmani', 'tiverson', 'gccktaira', 'vtn2', 'bmiller', 'rosew', 'jbschafer', 'keen', 'sgilbert@occ.cccd.edu', 'as2653', 'RKumar', 'dcaswell', 'dzolzer', 'gvrinten', 'minsu', 'mzmacky2', 'ktaira', 'rkowaney', 'WarrenPettitt', 'tompkins', 'tompkinsJack', 'HullSmurf001', 'tagliarinig', 'CSC131tompkins', 'vellinga', 'pearcej', 'rambasnet', 'mdogy', 'cmurakami', 'ficekr', 'ficek', 'wilf.lendrum', 'shlomor', 'jazcap', 'dlovell', 'alter718', 'polarlemniscate', 'acbart', 'pmisterovich', 'CheeseTurtle', 'mr.octavio', 'mirita81', 'shyampurk', 'robertcduvall', 'ludaesch', 'rpugh', 'khelson1', 'emcpPYTHON', 'horacers', 'JohnDoorenbos', 'dunfeer', 'techplex.engineer', 'misa.tesi', 'kwhillans', 'dr_bwilson', 'r70smith', 'tand', 'bschormann', 'traveltcg', 'mridgway', 'hhtsimpson', 'bmoening', 'mhilland', 'pam.meier', 'tdragon@ithaca.edu', 'bokhariz', 'MrYantho', 'ine5201', 'avunque', 'rasika', 'dtgreen', 'camingus', 'hogue', 'tstrat', 'stormeb', 'lwchadd', 'xty.dsouza', 'elevel', 'ThaddeusAid', 'davidUrr', 'hitoshi', 'mark.schmitt', 'eallatta', 'mfitzpatrick', 'vcowal', 'salihtuncer', 'warrenpettitt', 'pylearner', 'william.levi.collins', 'Barrie', 'Jacob777', 'vfxpraveen117', 'aoteatech', 'nask.meerpaal', 'jsolin', 'd.giordan', 'tmoffdawg', 'scmreason', 'Mr_Gray', 'case.andrew', 'trampgeek', 'niroshseven', 'decadous', 'MrBate', 'stcrispin', 'mieke.gorissen', 'englert.ervin', 'ericsonga', 'mithron', 'numodica', 'Mathad19', 'JenksW', 'LonnekeDriessen', 'kwaljee', 'larry', 'pentrium', 'moffdawg', 'maxmori77', 'SandeepShelke', 'davelai', 'tommrrtt', 'spock_tpol', 'ekeoid', 'BantamJoe', 'alter718_2', 'MegW', 'elanus', 'cn.gheorghetiteica', 'yoyo_available', 'sdmcgee', 'rdotsch', 'gjvoigt', 'gmacmanus', 'mharrison', 'jfemiani', 'femiani', 'kwright', 'dekrwright', 'jenniferchoffmann', 'mrschellenberg', 'The Salty Seaman', 'shane.dicks1', 'mleblanc', 'ProfNorman', 'metrpy', 'gregmax', 'ironwk85', 'hannutam', 'andrewgtp', 'realblurgh', 'madhavjha', 'acase', 'mchen53', 'jgilgen', 'ofenerci', 'JackTompkins', 'mdeakyne', 'spsofficers', 'comqsph', 'SteveTanimoto', 'jiffp', 'rkumar', 'Cmcguire', 'jheckelsmiller', 'OvidioMarinho', 'dianneoc', 'AppylPye', 'cwormsl2', 'kidbookrev', 'bic', 'afseTestAccount', 'vivek.lpc', '10081453', 'kaspermunch', 'supamrchen', 'johnkershaw', 'mesri', 'jfall', 'cheriegilmore', 'cewing', 'beth-mac', 'mcraig', 'jspacco', 'ranum', 'bcooper', 'xuxiaomin19892', 'cryptogoth', 'rowzer', 'allen', 'isaacdl', 'james adams', 'sciberteach', 'perry.fizzano', 'Green1313', 'cmheisel', 'wavemind', 'steven_king', 'assault', 'jgb', 'komo', 'gjenkins@lbcc.edu', 'FEdericoSOmaschini', 'david236', 'zarial01', 'miguser1', 'kev_jones', 'JuYoungKim', 'garyphillips', 'jasonlanda', 'kfkennedy', 'holcma01', 'jrincon@emsofl.com', 'johnpeeb', 'eudaimonious', 'VineetaSingh', 'rmueller', 'kash41', 'rtindell', 'mike.farfaraway', 'vilipold', 'weihongw', 'tonisvahi', 'rafaja', 'aliev', 'zacharyblackwood', 'smckennitt', 'drtech', 'bnminstruct', 'elliott', 'mleblanc@wheatoncollege.edu', 'CameronA', 'nicokie', 'volcan', 'matthewvenn', 'ttttamaki', 'Terry.Pasley@kctcs.edu', 'mpearse', 'gabrinerd', 'danschellenberg', 'alnitak')\n",
- "--#list of instructors\n",
- ";\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "97999 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 48,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 48
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Remove known test courses. "
- ]
- },
- {
- "cell_type": "heading",
- "level": 6,
- "metadata": {},
- "source": [
- "Currently limited to just gatch and devcourse. Will revise when known test courses has been updated."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE course_id IN ('gatech', 'devcourse','overview');\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "106732 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 49,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 49
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Remove unofficial div_ids"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "pythondsDivInfo = getDivs.getDivInfo('pythonds')\n",
- "pythondsDivs = pythondsDivInfo['div_id']\n",
- "str(pythondsDivs) #NOTE: I formatted as a string so I could copy and paste it more easily. Uncomment to get the list"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 50,
- "text": [
- "\"['active5', 'active6', 'active7', 'analysis_1', 'analysis_2', 'analysis_3', 'findMinVid', 'mcpyperform', 'mcpydictperf', 'pythonopsperf', 'active1', 'active2', 'active3', 'divby2', 'baseconvert', 'baseconvert1', 'baseconvert2', 'baseconvert3', 'video_Stack2', 'deqtest', 'ququeuetest', 'queue_1', 'stack_1ac', 'stack_ex_1', 'stack_cl_1', 'stack_1', 'stack_2', 'stack_stringrev', 'stack1_video', 'orderedlistclass', 'unorderedlistcomplete', 'self_check_list1', 'self_check_list2', 'intopost', 'postfixeval', 'postfix1', 'postfix2', 'postfix3', 'video_Stack3', 'palchecker', 'parcheck1', 'qujosephussim', 'qumainsim', 'print_sim_selfcheck', 'intro_8', 'self_check_1', 'list_unique', 'self_check_2', 'listcomp', 'monkeyvid', 'intro_1', 'intro_2', 'intro_3', 'intro_5', 'intro_7', 'scratch_01_01', 'strstuff', 'gcd_cl', 'fraction_class', 'self_check_4', 'fraction', 'complete_cuircuit', 'self_check_5', 'logicgates', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'HASH_1', 'HASH_2', 'hashtablecomplete', 'search3', 'search4', 'BSRCH_1', 'BSRCH_2', 'lst_bubble', 'bubble_anim', 'bubbletrace', 'lst_shortbubble', 'shortbubbletrace', 'question_sort_1', 'lst_insertion', 'insertion_anim', 'insertionsortcodetrace', 'question_sort_3', 'lst_merge', 'merge_anim', 'mergetrace', 'question_sort_5', 'question_sort_6', 'lst_quick', 'quick_anim', 'quicktrace', 'question_sort_7', 'question_sort_8', 'question_sort_9', 'lst_selectionsortcode', 'selection_anim', 'selectionsortcodetrace', 'question_sort_2', 'search1', 'search2', 'question_SRCH_1', 'question_SRCH_2', 'lst_shellSort', 'shell_anim', 'shellSorttrace', 'question_sort_4', 'completeheap', 'heap1', 'tree_list1', 'bin_tree', 'mctree_1', 'mctree_2', 'bintree', 'mctree_3', 'parsebuild', 'bst_1', 'completebstcode']\""
- ]
- }
- ],
- "prompt_number": 50
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE id in \n",
- "(\n",
- "SELECT useinfo.id FROM \n",
- "useinfo\n",
- "JOIN bookcourses\n",
- "ON (useinfo.course_id = bookcourses.course_name)\n",
- "WHERE \n",
- " book = 'pythonds' \n",
- " AND event NOT IN ('page','highlight','ac_error')\n",
- " AND div_id NOT IN ('active5', 'active6', 'active7', 'analysis_1', 'analysis_2', 'analysis_3', 'findMinVid', 'mcpyperform', 'mcpydictperf', 'pythonopsperf', 'active1', 'active2', 'active3', 'divby2', 'baseconvert', 'baseconvert1', 'baseconvert2', 'baseconvert3', 'video_Stack2', 'deqtest', 'ququeuetest', 'queue_1', 'stack_1ac', 'stack_ex_1', 'stack_cl_1', 'stack_1', 'stack_2', 'stack_stringrev', 'stack1_video', 'orderedlistclass', 'unorderedlistcomplete', 'self_check_list1', 'self_check_list2', 'intopost', 'postfixeval', 'postfix1', 'postfix2', 'postfix3', 'video_Stack3', 'palchecker', 'parcheck1', 'qujosephussim', 'qumainsim', 'print_sim_selfcheck', 'intro_8', 'self_check_1', 'list_unique', 'self_check_2', 'listcomp', 'monkeyvid', 'intro_1', 'intro_2', 'intro_3', 'intro_5', 'intro_7', 'scratch_01_01', 'strstuff', 'gcd_cl', 'fraction_class', 'self_check_4', 'fraction', 'complete_cuircuit', 'self_check_5', 'logicgates', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'HASH_1', 'HASH_2', 'hashtablecomplete', 'search3', 'search4', 'BSRCH_1', 'BSRCH_2', 'lst_bubble', 'bubble_anim', 'bubbletrace', 'lst_shortbubble', 'shortbubbletrace', 'question_sort_1', 'lst_insertion', 'insertion_anim', 'insertionsortcodetrace', 'question_sort_3', 'lst_merge', 'merge_anim', 'mergetrace', 'question_sort_5', 'question_sort_6', 'lst_quick', 'quick_anim', 'quicktrace', 'question_sort_7', 'question_sort_8', 'question_sort_9', 'lst_selectionsortcode', 'selection_anim', 'selectionsortcodetrace', 'question_sort_2', 'search1', 'search2', 'question_SRCH_1', 'question_SRCH_2', 'lst_shellSort', 'shell_anim', 'shellSorttrace', 'question_sort_4', 'completeheap', 'heap1', 'tree_list1', 'bin_tree', 'mctree_1', 'mctree_2', 'bintree', 'mctree_3', 'parsebuild', 'bst_1', 'completebstcode')\n",
- ");\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "31112 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 51,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 51
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "thinkcspyDivInfo = getDivs.getDivInfo('thinkcspy')\n",
- "thinkcspyDivs = thinkcspyDivInfo['div_id']\n",
- "str(thinkcspyDivs) #NOTE: I formatted as a string so I could copy and paste it more easily. Uncomment to get the list"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 52,
- "text": [
- "\"['chp13_classes4', 'chp13_classes5', 'chp13_classesstr1', 'chp13_classesstr2', 'ch_cl_ex_1_answer', 'ch_cl_02', 'ch_cl_ex_3_answer', 'ch_cl_04', 'ch_cl_05_answer', 'chp13_improveconstructor', 'chp13_classesmid1', 'scratch_cl_01', 'chp13_classes6', 'chp13_classes1', 'chp13_points', 'chp13_classes2', 'fractions_add1', 'scratch_cl_02\\\\t', 'ch_cl2_answer1', 'ch_cl2_q2 ', 'ch_cl2_q3answer', 'ch_cl2_q4', 'ch_cl2_q5answer', 'ch_cl2_answer7', 'ch_cl2_q8', 'fractions_init', 'fractions_gcd', 'fractions_simplify', 'fractions_is', 'fractions_eq1', 'db_ex3_1', 'db_ex3_2', 'db_q_ex3_1', 'db_ex3_3', 'db_ex3_4', 'db_ex3_5', 'db_qex32', 'db_ex3_6', 'db_ex3_7', 'db_ex3_8', 'db_ex3_9', 'db_ex3_10', 'db_ex3_11', 'db_ex3_12', 'ch12_dict11', 'test_question11_4_1', 'chp12_dict6', 'chp12_dict7', 'chp12_dict8', 'chp12_dict9', 'chp12_dict10', 'scratch_11_02', 'test_question11_3_1', 'test_question11_3_2', 'test_question11_3_3', 'test_question11_3_4', 'test_question11_3_5', 'ch12_dict4', 'ch12_dict4a', 'ch12_dict5', 'test_question11_2_1', 'ex_11_01', 'q1_answer', 'ex_11_02', 'ex_11_03', 'ex_11_04', 'ch11_q5_answer', 'chp12_sparse', 'chp12_dict1', 'chp12_dict2', 'chp12_dict3', 'scratch_11_01', 'test_question11_1_1 ', 'test_question11_1_2', 'files_while', 'ex_6_1', 'ch_files_q1answer', 'ex_10_2', 'ex_6_3', 'ch_files_q3answer', 'ex_10_4', 'ex_10_5', 'ch_files_q5answer', 'files_for', 'files_write01', 'ch05_barchart', 'scratch_05_06', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_newarea', 'test_question5_6_1', 'test_question5_6_2', 'sumofsquares', 'ch04_3', 'ch04_4', 'ch04_5', 'ch04_6', 'ch04_square', 'ch04_clsquare', 'ch04_clsquare_bad', 'test_question5_2_1', 'test_question5_2_2', 'ch06_distance1', 'ch06_distancefinal', 'function_accumulator_pattern', 'sq_accum1', 'sq_accum3', 'scratch_05_04', 'test_question5_4_1', 'question5_4_1p', 'bad_local', 'badsquare_1', 'cl_powerof_bad', 'cl_change_parm', 'test_question5_3_1', 'test_question5_3_2', 'test_question5_3_3', 'function_intro', 'ch04_1', 'ch04_1a', 'ch04_2', 'scratch_05_01', 'test_question5_1_1', 'test_question5_1_2', 'test_question5_1_3', 'test_question5_1_4', 'test_question5_1_5', 'test_question5_1_6', 'test_question5_1_7', 'ch01_2', 'question1_11_1', 'question1_1_1', 'question1_1_2', 'ch01_3', 'question1_12_1', 'scratch_01', 'question1_9_1', 'question1_10_1', 'question1_10_2', 'question1_4_1', 'question1_7_1', 'question1_8_1', 'codelensvid', 'ch01_1', 'firstexample', 'question1_3_1', 'question1_3_2', 'question1_6_1', 'question1_6_2', 'question1_2_1', 'question1_2_2', 'question1_2_3', 'question1_5_1', 'astlab_ast1', 'lab01_1', 'lab01_2', 'l0401', 'lab0401a', 'lab0401aa', 'chp12_single', 'piguessstart', 'seq3nlab1', 'sin1', 'sin2', 'sinlab1', 'sinlab2', 'chp09_02', 'test_question9_3_1', 'test_question9_3_2', 'test_question9_3_3', 'listalias1', 'chp09_is3', 'test_question9_10_1', 'appcon1', 'appcon2', 'appcon3', 'test_question9_15_1', 'chp09_is4', 'chp09_5', 'chp09_concatid', 'test_question9_5_1', 'test_question9_5_2', 'ex_9_2', 'ex_9_3', 'q3_answer', 'ex_9_4', 'ex_9_5', 'q5_answer', 'ex_7_11', 'ex_9_6', 'q7_answer', 'ex_9_7', 'ex_9_8', 'q9_answer', 'ex_9_9', 'ex_9_10', 'q11_answer', 'ex_9_11', 'ex_9_12', 'q13_answer', 'ex_9_13', 'ex_9_14', 'q15_answer', 'listcomp1', 'scratch_09_06', 'test_question9_20_1', 'ch09_11', 'scratch_09_01', 'chp09_01a', 'test_question9_2_1 ', 'test_question9_2_2', 'chp09_4', 'test_question9_4_1', 'test_question9_4_2', 'chp09_meth1', 'chp09_meth2', 'test_question9_13_1', 'test_question9_13_2', 'test_question9_13_3', 'test_question9_13_4', 'scratch_09_03', 'chp09_6', 'test_question9_6_1', 'chp09_01', 'test_question9_1_1 ', 'chp09_03a', 'chp09_03b', 'chp09_for3', 'chp09_for4', 'test_question9_16_1', 'ch09_7', 'item_assign', 'ch09_8', 'ch09_9', 'ch09_10', 'test_question9_7_1', 'chp09_nest', 'test_question9_21_1', 'chp09_is1', 'chp09_is2', 'chp09_istrace', 'ch09_mod2', 'ch09_mod3', 'repref1', 'repref2', 'repref3', 'reprefstep', 'test_question9_12_1', 'test_question9_12_2', 'ch09_split1', 'ch09_split2', 'ch09_join', 'test_question9_22_1', 'list_lsys1', 'list_lsys2', 'ch09_tuple1', 'chp09_tuple2', 'chp09_tuple3', 'scratch_09_07', 'chp09_parm1', 'chp09_parm1_trace', 'ch09_list1', 'pixelex1a', 'test_question7_8_1_1', 'pixelex1', 'test_question7_8_2_1', 'nested1', 'nested2', 'acimg_1', 'test_question7_8_3_1', 'test_question7_8_3_2', 'scratch_07_03', 'ex_7_7', 'q1_answer', 'ex_7_8', 'ex_7_9', 'q3_answer', 'ex_7_14', 'ex_7_13', 'q5_answer', 'ex_7_12', 'ex_7_15', 'q7_answer', 'ex_7_16', 'ex_7_17', 'q9_answer', 'ex_7_18', 'ex_7_19', 'answer_7_11', 'ex_7_20', 'ex_7_21', 'q13_answer', 'ex_7_22', 'ex_7_23', 'q15_answer', 'scratch_07_05', 'chp07_newtonsdef', 'chp07_newtonswhile', 'iter_randwalk1', 'iter_randwalk2', 'test_question7_3_1', 'test_question7_3_2', 'ch07_table1', 'test_question7_7_1', 'ch07_indef1', 'test_question7_4_1', 'ch07_for1', 'ch07_summation', 'whileloop', 'ch07_while1', 'ch07_while2', 'scratch_07_01', 'test_question7_2_1', 'test_question7_2_2', 'mod_q1_answer', 'ex_mod_2', 'mod_q3_answer', 'ex_mod_4', 'randmodvid', 'chmodule_02', 'question4_2_1', 'chmodule_rand', 'chmodule_rand2', 'question4_4_1', 'question4_4_2', 'question4_4_3', 'question4_4_4', 'scratch_04', 'inputvid', 'chmod_01', 'question4_1_1', 'question4_1_2', 'question4_1_3', 'ch03_7', '3_10', '3_11', 'ex_3_1', 'q1_answer', 'ex_3_2', 'ex_3_3', 'q3_answer', 'ex_3_4', 'ex_3_5', 'ex_3_6', 'ex_3_7', 'q7_answer', 'ex_3_8', 'ex_3_9', 'q9_answer', 'ex_3_10', 'ex_3_11', 'q11_answer', 'ex_3_12', 'ex_3_13', 'q13_answer', 'vtest', 'ch03_3', 'test_question3_2_1', '3_6', '3_7', 'ch03_for1', 'ch03_forcolor', 'colorlist', '3_8', '3_9', 'test_question3_4_1', 'test_question3_4_2', 'test_question3_4_3', 'test_question3_4_4', 'ch03_1', 'test_question3_1_0', '3_1', '3_2', '3_3', 'ch03_2', 'test_question3_1_1', 'test_question3_1_2', 'test_question3_1_3', 'test_question3_1_4', '3_4', '3_5', 'scratch_03', 'forloopvid', 'ch03_4', 'advrange', 'ch03_5', 'rangeme', 'ch03_6', 'rangeme2', 'test_question3_5_1', 'test_question3_5_2', 'test_question3_5_3', 'assignvid', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_boolcodelens', 'test_question6_8_1', 'test_question6_8_2', 'scratch_06_03', 'booleanexpressions', 'ch05_1', 'ch05_1a', 'ch05_2', 'test_question6_1_1', 'sel4', 'scratch_06_02', 'test_question6_7_1', 'test_question6_7_2', 'binaryselection', 'ch05_4', 'test_question6_4_1', 'test_question6_4_2', 'test_question6_4_3', 'ex_6_1', 'ex_6_2', 'ex_6_3', 'q3_question', 'ex_6_4', 'ex_6_5', 'answer_ex_6_5', 'ex_6_6', 'ex_6_7', 'q7_answer', 'ex_6_8', 'ex_6_9', 'q9_answer', 'ex_6_10', 'ex_6_11', 'q11_answer', 'ex_6_12', 'ex_6_13', 'answer_ex_6_13', 'chp05_3', 'test_question6_2_1', 'sel2', 'sel1', 'test_question6_6_1', 'unaryselection', 'ch05_unaryselection', 'test_question6_5_1', 'test_question6_5_2', 'scratch_06_01', 'test_question6_3_1', 'ch02_ex1', 'ex_2_2', 'ex_2_3', 'q3_answer', 'ex_2_4', 'ex_2_5', 'q5_answer ', 'ex_2_6', 'ex_2_7', 'q7_answer', 'ex_2_8', 'ex_2_9', 'q9_answer', 'ex_2_10', 'ex_2_11', 'q11_answer', 'ex_2_12', 'inputvid', 'inputfun', 'int_secs', 'test_question2_7_1', 'ch02_15', 'ch02_16', 'ch02_17', 'ch02_18', 'ch02_19_codelens', 'test_question2_6_1', 'test_question2_6_2', 'test_question2_6_3', 'precedencevid', 'associativityvid', 'ch02_23', 'test_question2_8_1', 'test_question2_8_2', 'reassignmentvid', 'ch07_reassign1', 'ch07_reassign2', 'test_question2_9_1', 'expression_vid', 'ch02_13', 'ch02_14', 'ch02_20', 'ch02_21', 'ch02_22', 'test_question2_2_1', 'updatevid', 'ch07_update1', 'test_question2_10_1', 'test_question2_10_2', 'question2_10_3', 'scratch_02', 'ch02_1', 'ch02_2', 'ch02_3', 'ch02_4', 'ch02_5', 'ch02_6', 'ch02_7', 'ch02_8', 'ch02_8a', 'test_question2_1_1', 'test_question2_1_2', 'test_question2_4_1', 'assignvid', 'ch02_9', 'ch02_9_codelens', 'ch02_10', 'ch02_11', 'test_question2_3_2', 'typesnconvert', 'ch08_run3', 'scratch_08_04', 'ex_8_2', 'ex_8_3', 'q3_answer', 'ex_8_4', 'ex_7_10', 'q5_answer', 'ex_8_5', 'ex_8_6', 'q7_answer', 'ex_8_7', 'ex_8_8', 'q9_answer', 'ex_8_9', 'ex_8_10', 'q11_answer', 'ex_8_11', 'ex_8_12', 'q13_answer', 'ex_8_13', 'ex_8_14', 'q15_answer', 'ex_8_15', 'ex_8_16', 'q17_answer', 'ex_8_17', 'ex_8_18', 'q19_answer', 'ex_8_19', 'ex_8_20', 'q21_answer', 'chp08_index1', 'test_question8_2_1', 'test_question8_2_2', 'chp08_len1', 'chp08_len2', 'ch08_len3', 'test_question8_4_1', 'test_question8_4_2', 'chp08_fun2', 'ch08_add', 'ch08_mult', 'test_question8_1_1 ', 'test_question8_1_2', 'ch08_fun4', 'chp08_fun5', 'chp08_fun6', 'ch08_comp1', 'ch08_comp2', 'chp08_ord1', 'ch08_ord2', 'ch08_ord3', 'test_question8_6_1', 'test_question8_6_2', 'test_question8_6_3', 'chp08_upper', 'ch08_methods1', 'ch08_methods2', 'test_question8_3_1', 'test_question8_3_2', 'cg08_imm1', 'ch08_imm2', 'test_question8_7_1', 'ch08_acc1', 'ch08_acc2', 'test_question8_11_1', 'scratch_08_03', 'chp08_slice1', 'chp08_slice2', 'test_question8_5_1', 'test_question8_5_2', 'scratch_08_01', 'chp8_in1', 'chp8_in2', 'chp8_in3', 'ch08_7', 'ch08_7b', 'ch08_8', 'test_question8_9_1', 'ch08_4', 'ch08_5', 'ch08_6', 'test_question8_8_1', 'test_question8_8_2', 'ch08_7c', 'ch08_7c1', 'test_question8_10_1', 'scratch_08_02', 'string_lsys1', 'strings_lys2']\""
- ]
- }
- ],
- "prompt_number": 52
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE id IN\n",
- "(\n",
- "SELECT useinfo.id FROM useinfo\n",
- "JOIN bookcourses\n",
- "ON (useinfo.course_id = bookcourses.course_name)\n",
- "WHERE \n",
- " book = 'thinkcspy' \n",
- " AND event NOT IN ('page','highlight','ac_error')\n",
- " AND div_id NOT IN ('chp13_classes4', 'chp13_classes5', 'chp13_classesstr1', 'chp13_classesstr2', 'ch_cl_ex_1_answer', 'ch_cl_02', 'ch_cl_ex_3_answer', 'ch_cl_04', 'ch_cl_05_answer', 'chp13_improveconstructor', 'chp13_classesmid1', 'scratch_cl_01', 'chp13_classes6', 'chp13_classes1', 'chp13_points', 'chp13_classes2', 'fractions_add1', 'scratch_cl_02\\\\t', 'ch_cl2_answer1', 'ch_cl2_q2 ', 'ch_cl2_q3answer', 'ch_cl2_q4', 'ch_cl2_q5answer', 'ch_cl2_answer7', 'ch_cl2_q8', 'fractions_init', 'fractions_gcd', 'fractions_simplify', 'fractions_is', 'fractions_eq1', 'db_ex3_1', 'db_ex3_2', 'db_q_ex3_1', 'db_ex3_3', 'db_ex3_4', 'db_ex3_5', 'db_qex32', 'db_ex3_6', 'db_ex3_7', 'db_ex3_8', 'db_ex3_9', 'db_ex3_10', 'db_ex3_11', 'db_ex3_12', 'ch12_dict11', 'test_question11_4_1', 'chp12_dict6', 'chp12_dict7', 'chp12_dict8', 'chp12_dict9', 'chp12_dict10', 'scratch_11_02', 'test_question11_3_1', 'test_question11_3_2', 'test_question11_3_3', 'test_question11_3_4', 'test_question11_3_5', 'ch12_dict4', 'ch12_dict4a', 'ch12_dict5', 'test_question11_2_1', 'ex_11_01', 'q1_answer', 'ex_11_02', 'ex_11_03', 'ex_11_04', 'ch11_q5_answer', 'chp12_sparse', 'chp12_dict1', 'chp12_dict2', 'chp12_dict3', 'scratch_11_01', 'test_question11_1_1 ', 'test_question11_1_2', 'files_while', 'ex_6_1', 'ch_files_q1answer', 'ex_10_2', 'ex_6_3', 'ch_files_q3answer', 'ex_10_4', 'ex_10_5', 'ch_files_q5answer', 'files_for', 'files_write01', 'ch05_barchart', 'scratch_05_06', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_newarea', 'test_question5_6_1', 'test_question5_6_2', 'sumofsquares', 'ch04_3', 'ch04_4', 'ch04_5', 'ch04_6', 'ch04_square', 'ch04_clsquare', 'ch04_clsquare_bad', 'test_question5_2_1', 'test_question5_2_2', 'ch06_distance1', 'ch06_distancefinal', 'function_accumulator_pattern', 'sq_accum1', 'sq_accum3', 'scratch_05_04', 'test_question5_4_1', 'question5_4_1p', 'bad_local', 'badsquare_1', 'cl_powerof_bad', 'cl_change_parm', 'test_question5_3_1', 'test_question5_3_2', 'test_question5_3_3', 'function_intro', 'ch04_1', 'ch04_1a', 'ch04_2', 'scratch_05_01', 'test_question5_1_1', 'test_question5_1_2', 'test_question5_1_3', 'test_question5_1_4', 'test_question5_1_5', 'test_question5_1_6', 'test_question5_1_7', 'ch01_2', 'question1_11_1', 'question1_1_1', 'question1_1_2', 'ch01_3', 'question1_12_1', 'scratch_01', 'question1_9_1', 'question1_10_1', 'question1_10_2', 'question1_4_1', 'question1_7_1', 'question1_8_1', 'codelensvid', 'ch01_1', 'firstexample', 'question1_3_1', 'question1_3_2', 'question1_6_1', 'question1_6_2', 'question1_2_1', 'question1_2_2', 'question1_2_3', 'question1_5_1', 'astlab_ast1', 'lab01_1', 'lab01_2', 'l0401', 'lab0401a', 'lab0401aa', 'chp12_single', 'piguessstart', 'seq3nlab1', 'sin1', 'sin2', 'sinlab1', 'sinlab2', 'chp09_02', 'test_question9_3_1', 'test_question9_3_2', 'test_question9_3_3', 'listalias1', 'chp09_is3', 'test_question9_10_1', 'appcon1', 'appcon2', 'appcon3', 'test_question9_15_1', 'chp09_is4', 'chp09_5', 'chp09_concatid', 'test_question9_5_1', 'test_question9_5_2', 'ex_9_2', 'ex_9_3', 'q3_answer', 'ex_9_4', 'ex_9_5', 'q5_answer', 'ex_7_11', 'ex_9_6', 'q7_answer', 'ex_9_7', 'ex_9_8', 'q9_answer', 'ex_9_9', 'ex_9_10', 'q11_answer', 'ex_9_11', 'ex_9_12', 'q13_answer', 'ex_9_13', 'ex_9_14', 'q15_answer', 'listcomp1', 'scratch_09_06', 'test_question9_20_1', 'ch09_11', 'scratch_09_01', 'chp09_01a', 'test_question9_2_1 ', 'test_question9_2_2', 'chp09_4', 'test_question9_4_1', 'test_question9_4_2', 'chp09_meth1', 'chp09_meth2', 'test_question9_13_1', 'test_question9_13_2', 'test_question9_13_3', 'test_question9_13_4', 'scratch_09_03', 'chp09_6', 'test_question9_6_1', 'chp09_01', 'test_question9_1_1 ', 'chp09_03a', 'chp09_03b', 'chp09_for3', 'chp09_for4', 'test_question9_16_1', 'ch09_7', 'item_assign', 'ch09_8', 'ch09_9', 'ch09_10', 'test_question9_7_1', 'chp09_nest', 'test_question9_21_1', 'chp09_is1', 'chp09_is2', 'chp09_istrace', 'ch09_mod2', 'ch09_mod3', 'repref1', 'repref2', 'repref3', 'reprefstep', 'test_question9_12_1', 'test_question9_12_2', 'ch09_split1', 'ch09_split2', 'ch09_join', 'test_question9_22_1', 'list_lsys1', 'list_lsys2', 'ch09_tuple1', 'chp09_tuple2', 'chp09_tuple3', 'scratch_09_07', 'chp09_parm1', 'chp09_parm1_trace', 'ch09_list1', 'pixelex1a', 'test_question7_8_1_1', 'pixelex1', 'test_question7_8_2_1', 'nested1', 'nested2', 'acimg_1', 'test_question7_8_3_1', 'test_question7_8_3_2', 'scratch_07_03', 'ex_7_7', 'q1_answer', 'ex_7_8', 'ex_7_9', 'q3_answer', 'ex_7_14', 'ex_7_13', 'q5_answer', 'ex_7_12', 'ex_7_15', 'q7_answer', 'ex_7_16', 'ex_7_17', 'q9_answer', 'ex_7_18', 'ex_7_19', 'answer_7_11', 'ex_7_20', 'ex_7_21', 'q13_answer', 'ex_7_22', 'ex_7_23', 'q15_answer', 'scratch_07_05', 'chp07_newtonsdef', 'chp07_newtonswhile', 'iter_randwalk1', 'iter_randwalk2', 'test_question7_3_1', 'test_question7_3_2', 'ch07_table1', 'test_question7_7_1', 'ch07_indef1', 'test_question7_4_1', 'ch07_for1', 'ch07_summation', 'whileloop', 'ch07_while1', 'ch07_while2', 'scratch_07_01', 'test_question7_2_1', 'test_question7_2_2', 'mod_q1_answer', 'ex_mod_2', 'mod_q3_answer', 'ex_mod_4', 'randmodvid', 'chmodule_02', 'question4_2_1', 'chmodule_rand', 'chmodule_rand2', 'question4_4_1', 'question4_4_2', 'question4_4_3', 'question4_4_4', 'scratch_04', 'inputvid', 'chmod_01', 'question4_1_1', 'question4_1_2', 'question4_1_3', 'ch03_7', '3_10', '3_11', 'ex_3_1', 'q1_answer', 'ex_3_2', 'ex_3_3', 'q3_answer', 'ex_3_4', 'ex_3_5', 'ex_3_6', 'ex_3_7', 'q7_answer', 'ex_3_8', 'ex_3_9', 'q9_answer', 'ex_3_10', 'ex_3_11', 'q11_answer', 'ex_3_12', 'ex_3_13', 'q13_answer', 'vtest', 'ch03_3', 'test_question3_2_1', '3_6', '3_7', 'ch03_for1', 'ch03_forcolor', 'colorlist', '3_8', '3_9', 'test_question3_4_1', 'test_question3_4_2', 'test_question3_4_3', 'test_question3_4_4', 'ch03_1', 'test_question3_1_0', '3_1', '3_2', '3_3', 'ch03_2', 'test_question3_1_1', 'test_question3_1_2', 'test_question3_1_3', 'test_question3_1_4', '3_4', '3_5', 'scratch_03', 'forloopvid', 'ch03_4', 'advrange', 'ch03_5', 'rangeme', 'ch03_6', 'rangeme2', 'test_question3_5_1', 'test_question3_5_2', 'test_question3_5_3', 'assignvid', 'lst_itsum', 'lst_recsum', 'lst_rectostr', 'recursion_sc_1', 'recursion_sc_2', 'chp12_koch', 'lst_change2', 'lst_dpremember', 'completemaze', 'chp11_recursivesum', 'recursion_scratch_1', 'lst_st', 'lst_recstack', 'question_recsimp_1', 'question_recsimp_2 ', 'hanoi', 'lst_turt1', 'lst_complete_tree', 'recursion_sc_3', 'ch06_boolfun1', 'ch06_boolfun2', 'ch06_boolcodelens', 'test_question6_8_1', 'test_question6_8_2', 'scratch_06_03', 'booleanexpressions', 'ch05_1', 'ch05_1a', 'ch05_2', 'test_question6_1_1', 'sel4', 'scratch_06_02', 'test_question6_7_1', 'test_question6_7_2', 'binaryselection', 'ch05_4', 'test_question6_4_1', 'test_question6_4_2', 'test_question6_4_3', 'ex_6_1', 'ex_6_2', 'ex_6_3', 'q3_question', 'ex_6_4', 'ex_6_5', 'answer_ex_6_5', 'ex_6_6', 'ex_6_7', 'q7_answer', 'ex_6_8', 'ex_6_9', 'q9_answer', 'ex_6_10', 'ex_6_11', 'q11_answer', 'ex_6_12', 'ex_6_13', 'answer_ex_6_13', 'chp05_3', 'test_question6_2_1', 'sel2', 'sel1', 'test_question6_6_1', 'unaryselection', 'ch05_unaryselection', 'test_question6_5_1', 'test_question6_5_2', 'scratch_06_01', 'test_question6_3_1', 'ch02_ex1', 'ex_2_2', 'ex_2_3', 'q3_answer', 'ex_2_4', 'ex_2_5', 'q5_answer ', 'ex_2_6', 'ex_2_7', 'q7_answer', 'ex_2_8', 'ex_2_9', 'q9_answer', 'ex_2_10', 'ex_2_11', 'q11_answer', 'ex_2_12', 'inputvid', 'inputfun', 'int_secs', 'test_question2_7_1', 'ch02_15', 'ch02_16', 'ch02_17', 'ch02_18', 'ch02_19_codelens', 'test_question2_6_1', 'test_question2_6_2', 'test_question2_6_3', 'precedencevid', 'associativityvid', 'ch02_23', 'test_question2_8_1', 'test_question2_8_2', 'reassignmentvid', 'ch07_reassign1', 'ch07_reassign2', 'test_question2_9_1', 'expression_vid', 'ch02_13', 'ch02_14', 'ch02_20', 'ch02_21', 'ch02_22', 'test_question2_2_1', 'updatevid', 'ch07_update1', 'test_question2_10_1', 'test_question2_10_2', 'question2_10_3', 'scratch_02', 'ch02_1', 'ch02_2', 'ch02_3', 'ch02_4', 'ch02_5', 'ch02_6', 'ch02_7', 'ch02_8', 'ch02_8a', 'test_question2_1_1', 'test_question2_1_2', 'test_question2_4_1', 'assignvid', 'ch02_9', 'ch02_9_codelens', 'ch02_10', 'ch02_11', 'test_question2_3_2', 'typesnconvert', 'ch08_run3', 'scratch_08_04', 'ex_8_2', 'ex_8_3', 'q3_answer', 'ex_8_4', 'ex_7_10', 'q5_answer', 'ex_8_5', 'ex_8_6', 'q7_answer', 'ex_8_7', 'ex_8_8', 'q9_answer', 'ex_8_9', 'ex_8_10', 'q11_answer', 'ex_8_11', 'ex_8_12', 'q13_answer', 'ex_8_13', 'ex_8_14', 'q15_answer', 'ex_8_15', 'ex_8_16', 'q17_answer', 'ex_8_17', 'ex_8_18', 'q19_answer', 'ex_8_19', 'ex_8_20', 'q21_answer', 'chp08_index1', 'test_question8_2_1', 'test_question8_2_2', 'chp08_len1', 'chp08_len2', 'ch08_len3', 'test_question8_4_1', 'test_question8_4_2', 'chp08_fun2', 'ch08_add', 'ch08_mult', 'test_question8_1_1 ', 'test_question8_1_2', 'ch08_fun4', 'chp08_fun5', 'chp08_fun6', 'ch08_comp1', 'ch08_comp2', 'chp08_ord1', 'ch08_ord2', 'ch08_ord3', 'test_question8_6_1', 'test_question8_6_2', 'test_question8_6_3', 'chp08_upper', 'ch08_methods1', 'ch08_methods2', 'test_question8_3_1', 'test_question8_3_2', 'cg08_imm1', 'ch08_imm2', 'test_question8_7_1', 'ch08_acc1', 'ch08_acc2', 'test_question8_11_1', 'scratch_08_03', 'chp08_slice1', 'chp08_slice2', 'test_question8_5_1', 'test_question8_5_2', 'scratch_08_01', 'chp8_in1', 'chp8_in2', 'chp8_in3', 'ch08_7', 'ch08_7b', 'ch08_8', 'test_question8_9_1', 'ch08_4', 'ch08_5', 'ch08_6', 'test_question8_8_1', 'test_question8_8_2', 'ch08_7c', 'ch08_7c1', 'test_question8_10_1', 'scratch_08_02', 'string_lsys1', 'strings_lys2')\n",
- ");\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "961911 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 53,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 53
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Find and remove infered test courses or inactive courses"
- ]
- },
- {
- "cell_type": "heading",
- "level": 6,
- "metadata": {},
- "source": [
- "In order to clean out test courses, we need to infer which book goes with each course. This will result in the creation of a bookcourses table, a table similar to courses but with the book tacked on the end."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "class CoursesForBook():\n",
- " def __init__(self,book):\n",
- " self.name = book\n",
- " self.courses = []\n",
- " self.keyWords = []\n",
- " def addCourse(self,course):\n",
- " if course not in self.courses:\n",
- " self.courses.append(course)\n",
- " def addKeyWords(self, wordlst):\n",
- " for word in wordlst:\n",
- " self.keyWords.append(word)\n",
- " def __str__(self):\n",
- " string = self.name + '\\n'+''\n",
- " for i in self.courses:\n",
- " string +=i + '\\n'\n",
- " return string"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 54
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "book = bp.makeBookOutline('pythonds')\n",
- "book2 = bp.makeBookOutline('thinkcspy')\n",
- "\n",
- "pythonds = CoursesForBook('pythonds')\n",
- "pythonds.addKeyWords(['Analysis','Trees','BasicDS'])\n",
- "thinkcspy = CoursesForBook('thinkcspy')\n",
- "thinkcspy.addKeyWords(['Labs','SimplePythonData','GeneralIntro', 'moreiteration', 'helloturtle'])"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 55
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "thinkcspy.courses"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 56,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 56
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "query = '''\n",
- "SELECT distinct course_id, div_id\n",
- "FROM useinfo where event = 'page'\n",
- "'''\n",
- "df = psql.read_sql(query,conn)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 57
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "count = 0\n",
- "for num in df.index:\n",
- " count += 1\n",
- " if count % 500000 == 1:\n",
- " print(count)\n",
- " row = df.iloc[num]\n",
- " course = row.course_id\n",
- " div = row.div_id\n",
- " for word in pythonds.keyWords:\n",
- " try:\n",
- " if word in div :\n",
- " pythonds.addCourse(course)\n",
- " except TypeError:\n",
- " pass\n",
- " for word in thinkcspy.keyWords:\n",
- " try:\n",
- " if word in div :\n",
- " thinkcspy.addCourse(course)\n",
- " except TypeError:\n",
- " pass"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "1\n"
- ]
- }
- ],
- "prompt_number": 58
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "len(pythonds.courses)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 59,
- "text": [
- "14"
- ]
- }
- ],
- "prompt_number": 59
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "q = '''\n",
- "SELECT * FROM courses\n",
- "'''\n",
- "course_ids = psql.read_sql(q, conn)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 60
- },
- {
- "cell_type": "heading",
- "level": 6,
- "metadata": {},
- "source": [
- "These two lists are the same lists as pythonds.courses and thinkcspy.courses. I simply copied them to save time. "
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#temp1 = ['pythonds', 'COSC122', 'gatech', 'cs151ranum', 'cs22lbcc', 'cs22-lbcc', 'PaideiaCS2', 'CS136', 'GFU-Algorithms', 'GUDataStructures', 'PythonLevel2', 'luther151', 'devcourse']\n",
- "#temp2 = ['thinkcspy', 'aaron-thinkpy', 'IntroCompScience', 'CSC131HUA', 'CSC131004Fall2013', 'STEM_LWSD', 'CSC131tagliarini', 'CS115', 'GalileoAOIT', 'green', '729G04-HT2013', 'ISTA130_FALL_2013', 'FlemingPSP', 'Duke_CompSci101', 'METR1313', 'CS_Python', 'HPComSci_IB', 'CS1-Python', 'PSSD_Python', 'cs150ranum', 'CS30', '152', 'luther150c_fall13', 'SER101F2013', 'scacs', 'IHSPython', 'CS101', 'python_13', 'SMU_CS101', 'CS1140', 'CS140-WCC-F13', 'ISYS217', 'python', 'ICS2O-3C', 'LearnPython', '10ACS', 'Python', 'ITLP-Python', 'csf13f', 'secondp', 'FCHSCP2013', 'wspython', '01418116_56', 'qwerty', 'Thacker', 'IntroProgrammingJHS', 'Griess_Python', 'ParsonsTest', 'ComputerProgramming', 'CSIS101', 'MrEasonECSFall2013', 'CS@WPS', 'CS_20', 'python4lindseya', 'CSC226', 'BC_GCSECompSci', 'lsmsacs244v', 'pyladies', 'CS244', 'ITEC-1150-01-S14', '152Haugrud', 'CSC131tompkinsj', 'ComputerLiteracyPython', 'bcscomputing', 'BigSnake', 'CSC112800', 'testcamp', 'IC-COMP-171-SP-14', 'ProjectNorKam', 'cs110williams', 'CSIS152Spr2014Ficek', 'SUS13', 'SMU_CS106', 'CSIS101W14', 'ANewCourse', 'SBC2014', 'CSGPython2', 'comp123-s14', 'cs150ranumspr2014', 'NewmanSKE2014', 'CS902', 'peeblescourse', 'Spring2014CS104', 'cf_pyintro', 'HTHcomptech', 'PythonASMSA', 'CodingLevel1', 'PythonLevel1', 'CASMTFeb14', 'Pythonintro', 'PythonIntro', 'neintrocs', 'CSIT902', 'CS902Spr14', 'cap-comp101', 'cs-147-s14', 'LASAIntroCS', 'cs902python', 'APCS-Python', 'WCC_CS140_SP14', 'RMBSProgrammingSkills', 'SolinMediaComp', 'csTestPrject', '1314SP_HCT2447', 'ITLP', 'CS101SP14', 'IS1008', 'ProvaAnn77Corso', 'BeensPy', 'python4bio', 'gatech', 'bet_test', 'devcourse', 'pyocc', 'craig-trial', 'CSC112-800', 'comp115', 'unics1140', 'luther150a', 'luther150bc', 'CMPUT 174 Fall 2012', 'CS-1P', 'CMPUT174Fall2012', 'HCHS_GCSE_Python3', 'overview', 'PHBCS', 'HCHS_GCSE_Python', 'COMP201', 'Explorations', 'SKECS2012PythonI', 'SKECS2013PythonI', 'Python101', 'UniOfHertsCS101', 'BMO', 'AMSACS', 'C152_Sum', 'NZTeach', 'IntroCC', 'Summer2013', 'CBCS_CS101', 'PythonWorkshop', 'OCC_CS_A131_FALL_13', 'Python1', 'bic-think-tutorial', 'test_tssi']"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 61
- },
- {
- "cell_type": "heading",
- "level": 6,
- "metadata": {},
- "source": [
- "If running anew, replace temp1 and temp2 with pythonds.courses and thinkcspy.courses respectively."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "f = (lambda x: 'pythonds' if x in pythonds.courses else None or 'thinkcspy' if x in thinkcspy.courses else None)\n",
- "course_ids['book'] = course_ids['course_name'].map(f)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 62
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "course_ids\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>id</th>\n",
- " <th>course_id</th>\n",
- " <th>course_name</th>\n",
- " <th>term_start_date</th>\n",
- " <th>book</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0 </th>\n",
- " <td> 144</td>\n",
- " <td> None</td>\n",
- " <td> CSC131tagliarini</td>\n",
- " <td> 2013-08-20</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1 </th>\n",
- " <td> 2</td>\n",
- " <td> pythonds</td>\n",
- " <td> pythonds</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> pythonds</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2 </th>\n",
- " <td> 3</td>\n",
- " <td> thinkcspy</td>\n",
- " <td> thinkcspy</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3 </th>\n",
- " <td> 139</td>\n",
- " <td> None</td>\n",
- " <td> HPComSci_IB</td>\n",
- " <td> 2013-08-17</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4 </th>\n",
- " <td> 116</td>\n",
- " <td> None</td>\n",
- " <td> a</td>\n",
- " <td> 2013-08-07</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5 </th>\n",
- " <td> 6</td>\n",
- " <td> luther151</td>\n",
- " <td> luther151</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> pythonds</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>6 </th>\n",
- " <td> 8</td>\n",
- " <td> python4bio</td>\n",
- " <td> python4bio</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7 </th>\n",
- " <td> 88</td>\n",
- " <td> Python 101</td>\n",
- " <td> Python 101</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8 </th>\n",
- " <td> 9</td>\n",
- " <td> BeensPy</td>\n",
- " <td> BeensPy</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>9 </th>\n",
- " <td> 10</td>\n",
- " <td> pyocc</td>\n",
- " <td> pyocc</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>10 </th>\n",
- " <td> 11</td>\n",
- " <td> bet_test</td>\n",
- " <td> bet_test</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>11 </th>\n",
- " <td> 117</td>\n",
- " <td> None</td>\n",
- " <td> Moffdawg</td>\n",
- " <td> 2014-02-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>12 </th>\n",
- " <td> 119</td>\n",
- " <td> None</td>\n",
- " <td> web2py</td>\n",
- " <td> 2013-08-12</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>13 </th>\n",
- " <td> 7</td>\n",
- " <td> COSC122</td>\n",
- " <td> COSC122</td>\n",
- " <td> 2013-07-07</td>\n",
- " <td> pythonds</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>14 </th>\n",
- " <td> 118</td>\n",
- " <td> None</td>\n",
- " <td> STEM 10th Grade Computer Programming</td>\n",
- " <td> 2013-08-07</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>15 </th>\n",
- " <td> 13</td>\n",
- " <td> CS 115</td>\n",
- " <td> CS 115</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>16 </th>\n",
- " <td> 14</td>\n",
- " <td> CS115</td>\n",
- " <td> CS115</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>17 </th>\n",
- " <td> 15</td>\n",
- " <td> NorKam Programming With Python</td>\n",
- " <td> NorKam Programming With Python</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>18 </th>\n",
- " <td> 4</td>\n",
- " <td> gatech</td>\n",
- " <td> gatech</td>\n",
- " <td> 2013-08-09</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>19 </th>\n",
- " <td> 120</td>\n",
- " <td> None</td>\n",
- " <td> Web2py sem Segredos</td>\n",
- " <td> 2013-08-10</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>20 </th>\n",
- " <td> 121</td>\n",
- " <td> None</td>\n",
- " <td> Prj1</td>\n",
- " <td> 2013-08-10</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>21 </th>\n",
- " <td> 122</td>\n",
- " <td> None</td>\n",
- " <td> test</td>\n",
- " <td> 2013-08-10</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>22 </th>\n",
- " <td> 124</td>\n",
- " <td> None</td>\n",
- " <td> green</td>\n",
- " <td> 2013-08-12</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>23 </th>\n",
- " <td> 16</td>\n",
- " <td> wm_cs20</td>\n",
- " <td> wm_cs20</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>24 </th>\n",
- " <td> 151</td>\n",
- " <td> None</td>\n",
- " <td> algopython</td>\n",
- " <td> 2013-08-23</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>25 </th>\n",
- " <td> 125</td>\n",
- " <td> None</td>\n",
- " <td> AppliedGenomeInformatics</td>\n",
- " <td> 2014-09-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>26 </th>\n",
- " <td> 126</td>\n",
- " <td> None</td>\n",
- " <td> AppliedGenomeInformatics2</td>\n",
- " <td> 2014-09-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>27 </th>\n",
- " <td> 17</td>\n",
- " <td> craig-trial</td>\n",
- " <td> craig-trial</td>\n",
- " <td> 2011-01-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>28 </th>\n",
- " <td> 152</td>\n",
- " <td> None</td>\n",
- " <td> none</td>\n",
- " <td> 2013-08-23</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>29 </th>\n",
- " <td> 128</td>\n",
- " <td> None</td>\n",
- " <td> ISTA130_FALL_2013</td>\n",
- " <td> 2013-08-26</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>380</th>\n",
- " <td> 334</td>\n",
- " <td> None</td>\n",
- " <td> Spring2014CS104</td>\n",
- " <td> 2014-02-01</td>\n",
- " <td> thinkcspy</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>381</th>\n",
- " <td> 470</td>\n",
- " <td> None</td>\n",
- " <td> Choice</td>\n",
- " <td> 2014-06-02</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>382</th>\n",
- " <td> 499</td>\n",
- " <td> None</td>\n",
- " <td> sci30x</td>\n",
- " <td> 2014-06-19</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>383</th>\n",
- " <td> 500</td>\n",
- " <td> None</td>\n",
- " <td> sci30x2</td>\n",
- " <td> 2014-12-11</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>384</th>\n",
- " <td> 501</td>\n",
- " <td> None</td>\n",
- " <td> Decidability</td>\n",
- " <td> 2014-06-19</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>385</th>\n",
- " <td> 471</td>\n",
- " <td> None</td>\n",
- " <td> choice</td>\n",
- " <td> 2014-05-27</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>386</th>\n",
- " <td> 475</td>\n",
- " <td> None</td>\n",
- " <td> bnmtest91</td>\n",
- " <td> 2014-05-28</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>387</th>\n",
- " <td> 477</td>\n",
- " <td> None</td>\n",
- " <td> bnmtest92</td>\n",
- " <td> 2014-05-28</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>388</th>\n",
- " <td> 476</td>\n",
- " <td> None</td>\n",
- " <td> CS104Fall2014</td>\n",
- " <td> 2014-09-02</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>389</th>\n",
- " <td> 478</td>\n",
- " <td> None</td>\n",
- " <td> CCPS_Python</td>\n",
- " <td> 2014-05-29</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>390</th>\n",
- " <td> 479</td>\n",
- " <td> None</td>\n",
- " <td> ru_python</td>\n",
- " <td> 2014-06-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>391</th>\n",
- " <td> 481</td>\n",
- " <td> None</td>\n",
- " <td> TestCourse131234</td>\n",
- " <td> 2014-06-04</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>392</th>\n",
- " <td> 482</td>\n",
- " <td> None</td>\n",
- " <td> TestCourse131235</td>\n",
- " <td> 2014-06-04</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>393</th>\n",
- " <td> 483</td>\n",
- " <td> None</td>\n",
- " <td> TestCourse131236</td>\n",
- " <td> 2014-06-04</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>394</th>\n",
- " <td> 484</td>\n",
- " <td> None</td>\n",
- " <td> NYU-CSCI-UA2-005</td>\n",
- " <td> 2014-09-01</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>395</th>\n",
- " <td> 485</td>\n",
- " <td> None</td>\n",
- " <td> TestCourse131237</td>\n",
- " <td> 2014-06-04</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>396</th>\n",
- " <td> 486</td>\n",
- " <td> None</td>\n",
- " <td> lbcc-cs22</td>\n",
- " <td> 2014-06-05</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>397</th>\n",
- " <td> 487</td>\n",
- " <td> None</td>\n",
- " <td> WormsleyTest</td>\n",
- " <td> 2014-06-05</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>398</th>\n",
- " <td> 488</td>\n",
- " <td> None</td>\n",
- " <td> Programming1</td>\n",
- " <td> 2014-06-06</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>399</th>\n",
- " <td> 489</td>\n",
- " <td> None</td>\n",
- " <td> cchspython</td>\n",
- " <td> 2014-06-06</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>400</th>\n",
- " <td> 472</td>\n",
- " <td> None</td>\n",
- " <td> ChoiceCoding</td>\n",
- " <td> 2014-05-27</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>401</th>\n",
- " <td> 490</td>\n",
- " <td> None</td>\n",
- " <td> FirenzePasA042</td>\n",
- " <td> 2014-06-11</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>402</th>\n",
- " <td> 491</td>\n",
- " <td> None</td>\n",
- " <td> tp-Python</td>\n",
- " <td> 2014-06-12</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>403</th>\n",
- " <td> 492</td>\n",
- " <td> None</td>\n",
- " <td> Univ-of-Wash-CSE-373-Autumn-2014</td>\n",
- " <td> 2014-09-24</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>404</th>\n",
- " <td> 493</td>\n",
- " <td> None</td>\n",
- " <td> Ordinamentovettori</td>\n",
- " <td> 2014-06-16</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>405</th>\n",
- " <td> 494</td>\n",
- " <td> None</td>\n",
- " <td> CS902SU14</td>\n",
- " <td> 2014-06-16</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>406</th>\n",
- " <td> 495</td>\n",
- " <td> None</td>\n",
- " <td> 123</td>\n",
- " <td> 2014-06-16</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>407</th>\n",
- " <td> 503</td>\n",
- " <td> None</td>\n",
- " <td> ICS3U</td>\n",
- " <td> 2014-06-18</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>408</th>\n",
- " <td> 504</td>\n",
- " <td> None</td>\n",
- " <td> Learn_Pythom</td>\n",
- " <td> 2014-06-20</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>409</th>\n",
- " <td> 507</td>\n",
- " <td> None</td>\n",
- " <td> WestlakeCSP</td>\n",
- " <td> 2014-06-22</td>\n",
- " <td> None</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>410 rows \u00d7 5 columns</p>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 63,
- "text": [
- " id course_id \\\n",
- "0 144 None \n",
- "1 2 pythonds \n",
- "2 3 thinkcspy \n",
- "3 139 None \n",
- "4 116 None \n",
- "5 6 luther151 \n",
- "6 8 python4bio \n",
- "7 88 Python 101 \n",
- "8 9 BeensPy \n",
- "9 10 pyocc \n",
- "10 11 bet_test \n",
- "11 117 None \n",
- "12 119 None \n",
- "13 7 COSC122 \n",
- "14 118 None \n",
- "15 13 CS 115 \n",
- "16 14 CS115 \n",
- "17 15 NorKam Programming With Python \n",
- "18 4 gatech \n",
- "19 120 None \n",
- "20 121 None \n",
- "21 122 None \n",
- "22 124 None \n",
- "23 16 wm_cs20 \n",
- "24 151 None \n",
- "25 125 None \n",
- "26 126 None \n",
- "27 17 craig-trial \n",
- "28 152 None \n",
- "29 128 None \n",
- ".. ... ... \n",
- "380 334 None \n",
- "381 470 None \n",
- "382 499 None \n",
- "383 500 None \n",
- "384 501 None \n",
- "385 471 None \n",
- "386 475 None \n",
- "387 477 None \n",
- "388 476 None \n",
- "389 478 None \n",
- "390 479 None \n",
- "391 481 None \n",
- "392 482 None \n",
- "393 483 None \n",
- "394 484 None \n",
- "395 485 None \n",
- "396 486 None \n",
- "397 487 None \n",
- "398 488 None \n",
- "399 489 None \n",
- "400 472 None \n",
- "401 490 None \n",
- "402 491 None \n",
- "403 492 None \n",
- "404 493 None \n",
- "405 494 None \n",
- "406 495 None \n",
- "407 503 None \n",
- "408 504 None \n",
- "409 507 None \n",
- "\n",
- " course_name term_start_date book \n",
- "0 CSC131tagliarini 2013-08-20 thinkcspy \n",
- "1 pythonds 2011-01-01 pythonds \n",
- "2 thinkcspy 2011-01-01 thinkcspy \n",
- "3 HPComSci_IB 2013-08-17 thinkcspy \n",
- "4 a 2013-08-07 None \n",
- "5 luther151 2011-01-01 pythonds \n",
- "6 python4bio 2011-01-01 thinkcspy \n",
- "7 Python 101 2011-01-01 None \n",
- "8 BeensPy 2011-01-01 thinkcspy \n",
- "9 pyocc 2011-01-01 thinkcspy \n",
- "10 bet_test 2011-01-01 thinkcspy \n",
- "11 Moffdawg 2014-02-01 thinkcspy \n",
- "12 web2py 2013-08-12 None \n",
- "13 COSC122 2013-07-07 pythonds \n",
- "14 STEM 10th Grade Computer Programming 2013-08-07 None \n",
- "15 CS 115 2011-01-01 None \n",
- "16 CS115 2011-01-01 thinkcspy \n",
- "17 NorKam Programming With Python 2011-01-01 None \n",
- "18 gatech 2013-08-09 None \n",
- "19 Web2py sem Segredos 2013-08-10 None \n",
- "20 Prj1 2013-08-10 None \n",
- "21 test 2013-08-10 None \n",
- "22 green 2013-08-12 thinkcspy \n",
- "23 wm_cs20 2011-01-01 None \n",
- "24 algopython 2013-08-23 None \n",
- "25 AppliedGenomeInformatics 2014-09-01 None \n",
- "26 AppliedGenomeInformatics2 2014-09-01 None \n",
- "27 craig-trial 2011-01-01 None \n",
- "28 none 2013-08-23 None \n",
- "29 ISTA130_FALL_2013 2013-08-26 thinkcspy \n",
- ".. ... ... ... \n",
- "380 Spring2014CS104 2014-02-01 thinkcspy \n",
- "381 Choice 2014-06-02 None \n",
- "382 sci30x 2014-06-19 None \n",
- "383 sci30x2 2014-12-11 None \n",
- "384 Decidability 2014-06-19 None \n",
- "385 choice 2014-05-27 None \n",
- "386 bnmtest91 2014-05-28 None \n",
- "387 bnmtest92 2014-05-28 None \n",
- "388 CS104Fall2014 2014-09-02 None \n",
- "389 CCPS_Python 2014-05-29 None \n",
- "390 ru_python 2014-06-01 None \n",
- "391 TestCourse131234 2014-06-04 None \n",
- "392 TestCourse131235 2014-06-04 None \n",
- "393 TestCourse131236 2014-06-04 None \n",
- "394 NYU-CSCI-UA2-005 2014-09-01 None \n",
- "395 TestCourse131237 2014-06-04 None \n",
- "396 lbcc-cs22 2014-06-05 None \n",
- "397 WormsleyTest 2014-06-05 None \n",
- "398 Programming1 2014-06-06 None \n",
- "399 cchspython 2014-06-06 None \n",
- "400 ChoiceCoding 2014-05-27 None \n",
- "401 FirenzePasA042 2014-06-11 None \n",
- "402 tp-Python 2014-06-12 None \n",
- "403 Univ-of-Wash-CSE-373-Autumn-2014 2014-09-24 None \n",
- "404 Ordinamentovettori 2014-06-16 None \n",
- "405 CS902SU14 2014-06-16 None \n",
- "406 123 2014-06-16 None \n",
- "407 ICS3U 2014-06-18 None \n",
- "408 Learn_Pythom 2014-06-20 None \n",
- "409 WestlakeCSP 2014-06-22 None \n",
- "\n",
- "[410 rows x 5 columns]"
- ]
- }
- ],
- "prompt_number": 63
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "course_ids.to_sql('bookcourses',engine2,if_exists='replace')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 64
- },
- {
- "cell_type": "heading",
- "level": 6,
- "metadata": {},
- "source": [
- "Removal starts here"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "CREATE INDEX bookIndex\n",
- "ON bookcourses (index);\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "Done.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 65,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 65
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "query = '''\n",
- "SELECT book, course_id ,clicks,numDivs,numSids FROM\n",
- "(\n",
- "SELECT book, course_name\n",
- "FROM bookcourses\n",
- ") AS S\n",
- "JOIN\n",
- "(SELECT course_id, COUNT(*) AS clicks, COUNT(DISTINCT div_id) AS numDivs, COUNT(DISTINCT sid) AS numSids\n",
- "FROM useinfo\n",
- "GROUP BY course_id\n",
- ") AS T\n",
- "ON (S.course_name = T.course_id)\n",
- "'''\n",
- "\n",
- "course_idInfo = psql.read_sql(query, conn)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 66
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#Collect all of the courses that don't have a book associated with them\n",
- "noBook = course_idInfo[course_idInfo['book'] != 'pythonds']\n",
- "noBook = noBook[noBook['book'] != 'thinkcspy']"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 67
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "df = course_idInfo\n",
- "df1 = df[df['numdivs'] < 5]\n",
- "df2 = df[df['numsids'] < 5]\n",
- "df3 = df[df['clicks']<100]\n",
- "df4 = pd.merge(df1,df2,how='outer')\n",
- "df5 = pd.merge(df3,df4,how='outer')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 68
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "remove = pd.merge(df5,noBook, how = 'outer')['course_id']\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 69
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "remove"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 70,
- "text": [
- "0 9my0\n",
- "1 ACcomp11\n",
- "2 APCS-Python\n",
- "3 AppliedGenomeInformatics2\n",
- "4 BASE\n",
- "5 BC_GCSECompSci\n",
- "6 C152_Sum\n",
- "7 CBCS\n",
- "8 CS140\n",
- "9 CSC100_J_term\n",
- "10 CSIS152\n",
- "11 CSIT902\n",
- "12 CS_Python\n",
- "13 CodingLevel1\n",
- "14 DAM_00\n",
- "...\n",
- "75 septtest1\n",
- "76 stolaf121AS14\n",
- "77 tc_1\n",
- "78 test_tssi\n",
- "79 testcamp\n",
- "80 testproject\n",
- "81 IntroCC\n",
- "82 MrEasonECSFall2013\n",
- "83 NZTeach\n",
- "84 PythonASMSA\n",
- "85 SKECS2012PythonI\n",
- "86 WPSCS\n",
- "87 JavaReview\n",
- "88 Learning\n",
- "89 SER101F2013\n",
- "Name: course_id, Length: 90, dtype: object"
- ]
- }
- ],
- "prompt_number": 70
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "#This is for the clean database notebook. After making a table of the removed items, I'll remove those course_ids from useinfo \n",
- "remove.to_sql('outlier_courses',engine2,if_exists='replace')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 71
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "CREATE INDEX ourlier_course_Index\n",
- "ON outlier_courses (index);\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "Done.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 72,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 72
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE course_id IN \n",
- "(SELECT course_id FROM outlier_courses);\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "24827 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 73,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 73
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Delete Single Div Users from the database"
- ]
- },
- {
- "cell_type": "heading",
- "level": 5,
- "metadata": {},
- "source": [
- "Pythonds"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo WHERE id IN\n",
- "(\n",
- "SELECT useinfo.id FROM useinfo JOIN bookcourses\n",
- " ON (useinfo.course_id = bookcourses.course_name)\n",
- "WHERE sid IN\n",
- "(\n",
- "SELECT sid\n",
- "FROM \n",
- "(\n",
- " SELECT DISTINCT sid,div_id\n",
- " FROM useinfo JOIN bookcourses\n",
- " ON (useinfo.course_id = bookcourses.course_name)\n",
- " WHERE book = 'pythonds'\n",
- ") AS foo\n",
- "GROUP BY sid HAVING count(*) = 1\n",
- ")\n",
- "AND book = 'pythonds');\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "432951 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 74,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 74
- },
- {
- "cell_type": "heading",
- "level": 5,
- "metadata": {},
- "source": [
- "Thinkcspy"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo WHERE id IN\n",
- "(\n",
- "SELECT useinfo.id FROM useinfo JOIN bookcourses\n",
- " ON (useinfo.course_id = bookcourses.course_name)\n",
- "WHERE sid IN\n",
- "(\n",
- "SELECT sid\n",
- "FROM \n",
- "(\n",
- " SELECT DISTINCT sid,div_id\n",
- " FROM useinfo JOIN bookcourses\n",
- " ON (useinfo.course_id = bookcourses.course_name)\n",
- " WHERE book = 'thinkcspy'\n",
- ") AS foo\n",
- "GROUP BY sid HAVING count(*) = 1\n",
- ")\n",
- "AND book = 'thinkcspy');\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "520677 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 75,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 75
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Delete Short Term Users from the database"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE sid IN\n",
- "(\n",
- "--#Select all users whose duration on the website was less than an hour \n",
- " SELECT sid FROM\n",
- " \n",
- " (\n",
- " SELECT sid,timestamp,event,act,div_id,course_id \n",
- " FROM \n",
- " (\n",
- " SELECT sid FROM useinfo\n",
- " GROUP BY sid HAVING COUNT(*) > 1\n",
- " ) AS T \n",
- " NATURAL JOIN \n",
- " (\n",
- " SELECT sid,min(timestamp) startd, max(timestamp) endd FROM useinfo\n",
- " GROUP BY sid\n",
- " ) AS S \n",
- " NATURAL JOIN \n",
- " useinfo\n",
- " WHERE (endd - startd) < ('1 day')::interval\n",
- " ) AS mvu\n",
- ")\n",
- ";\n",
- "COMMIT;\n",
- " \n",
- " \n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "3164790 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 76,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 76
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- " %%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE sid IN\n",
- "(\n",
- " SELECT sid FROM\n",
- "--#Select all the users with only one action. \n",
- " (\n",
- " SELECT sid,timestamp,event,act,div_id,course_id FROM \n",
- " (\n",
- " SELECT sid FROM useinfo GROUP BY sid HAVING COUNT(*) < 2\n",
- " ) AS R \n",
- " NATURAL JOIN \n",
- " useinfo\n",
- " \n",
- " ) AS svu\n",
- " \n",
- ")\n",
- ";\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "10451 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 77,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 77
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 77
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Remove outliers with respect to clicks and duration."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "query = ''' \n",
- "SELECT sid, min(timestamp) startd, max(timestamp) endd\n",
- "FROM useinfo\n",
- "GROUP BY sid\n",
- "'''\n",
- "\n",
- "d = psql.frame_query(query,conn)\n",
- "d.head()"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>sid</th>\n",
- " <th>startd</th>\n",
- " <th>endd</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td> bcselvitazurkevica</td>\n",
- " <td>2012-10-02 08:15:40</td>\n",
- " <td>2014-02-11 21:10:49</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td> jPayne</td>\n",
- " <td>2013-09-10 15:43:59</td>\n",
- " <td>2013-09-29 20:55:51</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td> lisadawn</td>\n",
- " <td>2013-08-21 21:21:58</td>\n",
- " <td>2013-08-26 19:49:02</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td> mackedee</td>\n",
- " <td>2013-08-27 23:46:49</td>\n",
- " <td>2013-09-10 01:17:01</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td> +4dB</td>\n",
- " <td>2013-07-14 01:20:04</td>\n",
- " <td>2013-10-30 00:11:56</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 78,
- "text": [
- " sid startd endd\n",
- "0 bcselvitazurkevica 2012-10-02 08:15:40 2014-02-11 21:10:49\n",
- "1 jPayne 2013-09-10 15:43:59 2013-09-29 20:55:51\n",
- "2 lisadawn 2013-08-21 21:21:58 2013-08-26 19:49:02\n",
- "3 mackedee 2013-08-27 23:46:49 2013-09-10 01:17:01\n",
- "4 +4dB 2013-07-14 01:20:04 2013-10-30 00:11:56"
- ]
- }
- ],
- "prompt_number": 78
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "d['duration'] = d['endd'] - d['startd']\n",
- "d['durdays'] = d['duration'].map(lambda x: x.astype('timedelta64[D]')) #takes a function as a parameter and"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 79
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "df = d[d.duration > np.timedelta64(1,'D')]\n",
- "#df = df[df.sid != 'bmiller']\n",
- "#df = df[df.sid != 'ranum' ]\n",
- "df.index = df.sid"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 80
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "q = '''\n",
- "SELECT sid, COUNT(*) AS numvisits FROM useinfo\n",
- "GROUP BY sid \n",
- "ORDER BY numvisits\n",
- "'''\n",
- "visitf = psql.frame_query(q,conn)\n",
- "visitf.index = visitf.sid"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 81
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "dfr = df['sid'].values\n",
- "dfrs = set(dfr)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 82
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "count = 1\n",
- "pairs = []\n",
- "for u in df.index:\n",
- " if u in visitf.index:\n",
- " count += 1\n",
- " pairs.append((int(df.ix[u].durdays)/1000000000/(60*60*24),visitf.ix[u].numvisits, df.ix[u].sid ))"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 83
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "\n",
- "pf = DataFrame( {'duration':[x[0] for x in pairs], 'clicks':[x[1] for x in pairs],'sid': [x[2] for x in pairs] } )\n"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 84
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "def getOutliers(dataframe, dimension):\n",
- " mean = dataframe[dimension].mean()\n",
- " std = dataframe[dimension].std() \n",
- "\n",
- " newDF = dataframe[(dataframe[dimension] - mean)/std < 3.5]\n",
- " return newDF\n",
- " print(mean, \" \" , std)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 85
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "temp = getOutliers(pf, 'clicks') #Finds outliers with respects to clicks\n",
- "temp1 = getOutliers(pf, 'duration') #Finds the rest of the outliers with respect to duration. "
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 86
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "temp1.head()"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>clicks</th>\n",
- " <th>duration</th>\n",
- " <th>sid</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td> 66</td>\n",
- " <td> 19</td>\n",
- " <td> jPayne</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td> 544</td>\n",
- " <td> 4</td>\n",
- " <td> lisadawn</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td> 209</td>\n",
- " <td> 13</td>\n",
- " <td> mackedee</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td> 171</td>\n",
- " <td> 107</td>\n",
- " <td> +4dB</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td> 1563</td>\n",
- " <td> 111</td>\n",
- " <td> 0v3r2</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 87,
- "text": [
- " clicks duration sid\n",
- "1 66 19 jPayne\n",
- "2 544 4 lisadawn\n",
- "3 209 13 mackedee\n",
- "4 171 107 +4dB\n",
- "5 1563 111 0v3r2"
- ]
- }
- ],
- "prompt_number": 87
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "outliers = pd.merge(temp,temp1, how ='outer', on ='sid')\n",
- "temp1.to_sql('outlierssid',engine2,if_exists='replace')"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 88
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "CREATE INDEX outlierIndex\n",
- "ON outlierssid (sid);\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "Done.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 89,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 89
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "DELETE FROM useinfo\n",
- "WHERE sid IN \n",
- " (SELECT sid \n",
- " FROM useinfo\n",
- " NATURAL JOIN \n",
- " (\n",
- " SELECT sid \n",
- " FROM useinfo\n",
- " EXCEPT \n",
- " (SELECT sid \n",
- " FROM outlierssid\n",
- " )\n",
- " ) \n",
- " AS T\n",
- " );\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "628996 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 90,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 90
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "SELECT count(*) FROM useinfo;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "1 rows affected.\n"
- ]
- },
- {
- "html": [
- "<table>\n",
- " <tr>\n",
- " <th>count</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>9357764</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 91,
- "text": [
- "[(9357764,)]"
- ]
- }
- ],
- "prompt_number": 91
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Replace courselib with runestone"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "UPDATE useinfo\n",
- "SET div_id = regexp_replace(div_id,'^/courselib(.*)','/runestone\\1');\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "9357764 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 92,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 92
- },
- {
- "cell_type": "heading",
- "level": 3,
- "metadata": {},
- "source": [
- "Replace mChoice with mchoice"
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql \n",
- "UPDATE useinfo\n",
- "SET event = 'mchoice'\n",
- "WHERE event = 'mChoice';\n",
- "\n",
- "COMMIT;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "1166988 rows affected.\n",
- "Done.\n"
- ]
- },
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 93,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 93
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "SELECT * FROM useinfo\n",
- "where event = 'mChoice';"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "0 rows affected.\n"
- ]
- },
- {
- "html": [
- "<table>\n",
- " <tr>\n",
- " <th>id</th>\n",
- " <th>timestamp</th>\n",
- " <th>sid</th>\n",
- " <th>event</th>\n",
- " <th>act</th>\n",
- " <th>div_id</th>\n",
- " <th>course_id</th>\n",
- " </tr>\n",
- "</table>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 94,
- "text": [
- "[]"
- ]
- }
- ],
- "prompt_number": 94
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "SELECT count(*) FROM (\n",
- "SELECT DISTINCT sid FROM useinfo) AS t;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "1 rows affected.\n"
- ]
- },
- {
- "html": [
- "<table>\n",
- " <tr>\n",
- " <th>count</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>81894</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 95,
- "text": [
- "[(81894,)]"
- ]
- }
- ],
- "prompt_number": 95
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "%%sql\n",
- "SELECT count(*) FROM useinfo;"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "1 rows affected.\n"
- ]
- },
- {
- "html": [
- "<table>\n",
- " <tr>\n",
- " <th>count</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <td>9357764</td>\n",
- " </tr>\n",
- "</table>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 96,
- "text": [
- "[(9357764,)]"
- ]
- }
- ],
- "prompt_number": 96
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 96
- }
- ],
- "metadata": {}
- }
- ]
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement