Untitled

{
 "metadata": {
  "name": "",
  "signature": "sha256:6bdb7954cfe1c233e9b8cf2e3978cb41700c561f90d847e1fba49089cbba4b0f"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# built-in python library for interacting with .csv files\n",
      "import csv\n",
      "# natural language toolkit is a great python library for natural language processing\n",
      "import nltk\n",
      "# built-in python library for utility functions that introduce randomness\n",
      "import random\n",
      "# built-in python library for measuring time-related things\n",
      "import time\n",
      " \n",
      "# quick sanity check that the cell is executing\n",
      "print(\"test\")\n",
      " \n",
      "def get_length_bucket(sms_length):\n",
      "    \"\"\"\n",
      "    Classify an SMS length as \"short\", \"medium\" or \"long\".\n",
      " \n",
      "    Lengths below 20 are \"short\", lengths from 20 up to (but not including) 50\n",
      "    are \"medium\", and everything else is \"long\".\n",
      "    \"\"\"\n",
      "    # upper bounds are exclusive and checked in ascending order\n",
      "    for upper_bound, bucket in ((20, \"short\"), (50, \"medium\")):\n",
      "        if sms_length < upper_bound:\n",
      "            return bucket\n",
      "    return \"long\"\n",
      "    \n",
      "    \n",
      "def sms_features(sms):\n",
      "    \"\"\"\n",
      "    Build the feature dictionary that the model is based on for one SMS.\n",
      " \n",
      "    Three features are produced:\n",
      "      * \"length\": the bucket returned by `get_length_bucket(len(sms))`\n",
      "      * \"contains_reply\": True when the substring 'reply' occurs in the SMS\n",
      "      * \"contains_winner\": True when the substring 'winner' occurs in the SMS\n",
      " \n",
      "    Both substring checks are case-sensitive.\n",
      " \n",
      "    So if the SMS was \"Hey!\", the output of this function would be\n",
      "    {\n",
      "        \"length\": \"short\",\n",
      "        \"contains_reply\": False,\n",
      "        \"contains_winner\": False\n",
      "    }\n",
      "    \"\"\"\n",
      "    features = {}\n",
      "    features[\"length\"] = get_length_bucket(len(sms))\n",
      "    features['contains_reply'] = 'reply' in sms\n",
      "    features['contains_winner'] = 'winner' in sms\n",
      "    return features\n",
      " \n",
      " \n",
      " \n",
      "def get_feature_sets(csv_path='/home/vagrant/repos/datasets/sms_spam_or_ham.csv', row_limit=100000):\n",
      "    \"\"\"\n",
      "    Read labelled SMS rows from a csv file and turn them into feature sets.\n",
      " \n",
      "    # Step 1: read the rows of the csv file. Each row is expected to hold\n",
      "    the label in its first column (row[0]) and the body of the SMS in its\n",
      "    second column (row[1]).\n",
      " \n",
      "    # Step 2: turn every usable row into a feature dictionary using the\n",
      "    `sms_features` function.\n",
      " \n",
      "    csv_path: location of the csv file; the default is the path this\n",
      "        notebook has always read from.\n",
      "    row_limit: maximum number of csv rows to look at, counted before\n",
      "        malformed rows are skipped. Pass None to read the whole file.\n",
      " \n",
      "    Returns a list shaped like this:\n",
      "    [\n",
      "        (feature_dictionary, label), # corresponding to row 0\n",
      "        ... # corresponding to row 1 ... n\n",
      "    ]\n",
      " \n",
      "    Rows with fewer than two columns (blank lines included) are skipped.\n",
      "    Errors raised by `sms_features` itself are no longer swallowed.\n",
      "    \"\"\"\n",
      "    output_data = []\n",
      " \n",
      "    # 'rb' (read-only, binary) is the mode the Python 2 csv module asks for.\n",
      "    # The `with` block closes the file even when reading fails part-way.\n",
      "    with open(csv_path, 'rb') as f:\n",
      "        for index, row in enumerate(csv.reader(f)):\n",
      "            # stop once the requested number of raw rows has been examined\n",
      "            if row_limit is not None and index >= row_limit:\n",
      "                break\n",
      " \n",
      "            # a usable row needs both a label and a body; checking explicitly\n",
      "            # replaces the old bare `except:` that hid every kind of error\n",
      "            if len(row) < 2:\n",
      "                continue\n",
      " \n",
      "            label, body = row[0], row[1]\n",
      "            # add the tuple of (feature_dict, label) to output_data\n",
      "            output_data.append((sms_features(body), label))\n",
      " \n",
      "    return output_data\n",
      " \n",
      "def get_training_and_validation_sets(feature_sets, training_fraction=.20, seed=None):\n",
      "    \"\"\"\n",
      "    Randomly split the output of `get_feature_sets` into a training set and\n",
      "    a validation set.\n",
      " \n",
      "    feature_sets: list of (feature_dictionary, label) tuples. It is left\n",
      "        untouched; the shuffle happens on a copy.\n",
      "    training_fraction: share of the data (between 0 and 1) that goes into\n",
      "        the training set. The default of .20 keeps this notebook's existing\n",
      "        split. The more common rule of thumb is the other way round (about\n",
      "        80% training, 20% validation); pass .80 to get that.\n",
      "    seed: optional seed for a reproducible shuffle. None uses the shared\n",
      "        state of the `random` module.\n",
      " \n",
      "    Returns the tuple (training_set, validation_set).\n",
      "    \"\"\"\n",
      "    # shuffle a copy so the caller's list keeps its original order\n",
      "    shuffled = list(feature_sets)\n",
      "    if seed is None:\n",
      "        random.shuffle(shuffled)\n",
      "    else:\n",
      "        # a private generator keeps the global random state untouched\n",
      "        random.Random(seed).shuffle(shuffled)\n",
      " \n",
      "    # the training set is the first segment, the validation set is the rest\n",
      "    slicing_point = int(training_fraction * len(shuffled))\n",
      "    training_set = shuffled[:slicing_point]\n",
      "    validation_set = shuffled[slicing_point:]\n",
      "    return training_set, validation_set\n",
      " \n",
      "def run_classification(training_set, validation_set):\n",
      "    \"\"\"\n",
      "    Train a NaiveBayesClassifier on `training_set`, print its accuracy on\n",
      "    `validation_set`, and return the trained classifier.\n",
      "    \"\"\"\n",
      "    model = nltk.NaiveBayesClassifier.train(training_set)\n",
      "    # score the freshly trained model on the held-out data\n",
      "    score = nltk.classify.accuracy(model, validation_set)\n",
      "    print(\"The accuracy was.... {}\".format(score))\n",
      "    return model\n",
      " \n",
      "def predict(classifier, new_sms):\n",
      "    \"\"\"\n",
      "    Predict the label of a fresh data point (an SMS) with a trained classifier.\n",
      " \n",
      "    classifier: a trained classifier exposing `classify(feature_dictionary)`\n",
      "    new_sms: the text of the SMS to label\n",
      " \n",
      "    Returns whatever label `classifier.classify` picks for the SMS.\n",
      "    \"\"\"\n",
      "    # the features must come from `sms_features`; the previous call to\n",
      "    # `twitter_features` named a function that is not defined in this cell\n",
      "    # and raised a NameError as soon as predict() was used\n",
      "    return classifier.classify(sms_features(new_sms))\n",
      " \n",
      " \n",
      "# Drive the whole pipeline: build the features, split them, train, report timing\n",
      "start_time = time.time()\n",
      " \n",
      "print(\"Let's use Naive Bayes!\")\n",
      " \n",
      "our_feature_sets = get_feature_sets()\n",
      "our_training_set, our_validation_set = get_training_and_validation_sets(our_feature_sets)\n",
      "print(\"Size of our data set: {}\".format(len(our_feature_sets)))\n",
      " \n",
      "print(\"Now training the classifier and testing the accuracy...\")\n",
      "classifier = run_classification(our_training_set, our_validation_set)\n",
      " \n",
      "# wall-clock time covering loading, feature extraction, training and scoring\n",
      "end_time = time.time()\n",
      "completion_time = end_time - start_time\n",
      "print(\"It took {} seconds to run the algorithm\".format(completion_time))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "test\n",
        "Let's use Naive Bayes!\n",
        "Size of our data set: 5563\n",
        "Now training the classifier and testing the accuracy...\n",
        "The accuracy was.... 0.870590878454"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "It took 0.120777130127 seconds to run the algorithm\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}