Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "metadata": {
- "name": "",
- "signature": "sha256:6bdb7954cfe1c233e9b8cf2e3978cb41700c561f90d847e1fba49089cbba4b0f"
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
- {
- "cells": [
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# built-in python library for interacting with .csv files\n",
- "import csv\n",
- "# natural language toolkit is a great python library for natural language processing\n",
- "import nltk\n",
- "# built-in python library for utility functions that introduce randomness\n",
- "import random\n",
- "# built-in python library for measuring time-related things\n",
- "import time\n",
- " \n",
- "print \"test\"\n",
- " \n",
- "def get_length_bucket(sms_length):\n",
- " \"\"\"\n",
- " buckets the SMS length into either short / medium / long\n",
- " \"\"\"\n",
- " if sms_length < 20:\n",
- " return \"short\"\n",
- " elif sms_length < 50:\n",
- " return \"medium\"\n",
- " else:\n",
- " return \"long\"\n",
- " \n",
- " \n",
- "def sms_features(sms):\n",
- " \"\"\"\n",
- " Returns a dictionary of the features of the sms we want our model\n",
- " to be based on, e.g. sms_length.\n",
- " \n",
- " So if the SMS was \"Hey!\", the output of this function would be\n",
- " {\n",
- " \"length\": \"short\",\n",
- " \"contains_reply\": False,\n",
- " \"contains_winner\": False\n",
- " }\n",
- " \n",
- " If the SMS was \"Hey this is a really great idea and I think that we should totally implement this technique\",\n",
- " then the output would be\n",
- " {\n",
- " \"length\": \"long\",\n",
- " \"contains_reply\": False,\n",
- " \"contains_winner\": False\n",
- " }\n",
- " \"\"\"\n",
- " return {\n",
- " \"length\": get_length_bucket(len(sms)),\n",
- " 'contains_reply': 'reply' in sms,\n",
- " 'contains_winner': 'winner' in sms,\n",
- " \n",
- " }\n",
- " \n",
- " \n",
- " \n",
- "def get_feature_sets():\n",
- " \"\"\"\n",
- " # Step 1: This reads in the rows from the csv file which look like this:\n",
- " 0, I'm so sad\n",
- " 1, Happy!\n",
- " \n",
- " where the first column is the label; 0=negative, 1=positive\n",
- " and the second column is the body of the SMS\n",
- " \n",
- " # Step 2: Turn the csv rows into feature dictionaries using `sms_features` function above.\n",
- " \n",
- " The output of this function run on the example in Step 1 will look like this:\n",
- " [\n",
- " ({\"length\": \"short\"}, 0), # this corresponds to 0, I'm so sad\n",
- " ({\"length\": \"short\"}, 1) # this corresponds to 1, Happy!\n",
- " ]\n",
- " \n",
- " You can think about this more abstractly as this:\n",
- " [\n",
- " (feature_dictionary, label), # corresponding to row 0\n",
- " ... # corresponding to row 1 ... n\n",
- " ]\n",
- " \"\"\"\n",
- " # open the file, which we've placed at /home/vagrant/repos/datasets/sms_spam_or_ham.csv\n",
- " # 'rb' opens the file read-only in binary mode\n",
- " f = open('/home/vagrant/repos/datasets/sms_spam_or_ham.csv', 'rb')\n",
- " \n",
- " # let's read in the rows from the csv file\n",
- " rows = []\n",
- " for row in csv.reader(f):\n",
- " rows.append(row)\n",
- " \n",
- " # now let's generate the output that we specified in the comments above\n",
- " output_data = []\n",
- " \n",
- " # cap processing at the first 100,000 rows to keep experiments fast\n",
- " # (the recorded run of this cell reports a data set of 5563 rows, well under the cap);\n",
- " # when you experiment with the `sms_features` function to improve accuracy\n",
- " # feel free to get rid of the row limit and just run it on the whole set\n",
- " for row in rows[:100000]:\n",
- " # Remember that row[0] is the label, either 0 or 1\n",
- " # and row[1] is the SMS body\n",
- " \n",
- " # get the label\n",
- " label = row[0]\n",
- " \n",
- " # get the SMS body and compute the feature dictionary\n",
- " try:\n",
- " feature_dict = sms_features(row[1])\n",
- " except:\n",
- " continue\n",
- " # add the tuple of feature_dict, label to output_data\n",
- " data = (feature_dict, label)\n",
- " output_data.append(data)\n",
- " \n",
- " # close the file\n",
- " f.close()\n",
- " return output_data\n",
- " \n",
- "def get_training_and_validation_sets(feature_sets):\n",
- " \"\"\"\n",
- " This takes the output of `get_feature_sets`, randomly shuffles it to ensure we're\n",
- " taking an unbiased sample, and then splits the set of features into\n",
- " a training set and a validation set.\n",
- " \"\"\"\n",
- " # randomly shuffle the feature sets\n",
- " random.shuffle(feature_sets)\n",
- " \n",
- " # get the number of data points that we have\n",
- " count = len(feature_sets)\n",
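- " # NOTE(review): the slice below puts 20% of the set, also called \"corpus\", into training and the\n",
- " # remaining 80% into validation; the more common rule of thumb is the reverse (roughly 80% training) -- confirm which split is intended.\n",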
- " \n",
- " # we'll slice this list 20% the way through\n",
- " slicing_point = int(.20 * count)\n",
- " \n",
- " # the training set will be the first segment\n",
- " training_set = feature_sets[:slicing_point]\n",
- " \n",
- " # the validation set will be the second segment\n",
- " validation_set = feature_sets[slicing_point:]\n",
- " return training_set, validation_set\n",
- " \n",
- "def run_classification(training_set, validation_set):\n",
- " # train the NaiveBayesClassifier on the training_set\n",
- " classifier = nltk.NaiveBayesClassifier.train(training_set)\n",
- " # let's see how accurate it was\n",
- " accuracy = nltk.classify.accuracy(classifier, validation_set)\n",
- " print \"The accuracy was.... {}\".format(accuracy)\n",
- " return classifier\n",
- " \n",
- "def predict(classifier, new_sms):\n",
- " \"\"\"\n",
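- " Given a trained classifier and a fresh data point (an SMS),\n",
- " this will predict its label, either 0 or 1.\n",
- " \n",
- " NOTE(review): the return statement calls `twitter_features`, which is not defined in this cell,\n",
- " so predict() would raise NameError unless that name exists elsewhere in the session;\n",
- " `sms_features` is presumably the intended feature function.\n",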
- " \"\"\"\n",
- " return classifier.classify(twitter_features(new_sms))\n",
- " \n",
- " \n",
- "# Now let's use the above functions to run our program\n",
- "start_time = time.time()\n",
- " \n",
- "print \"Let's use Naive Bayes!\"\n",
- " \n",
- "our_feature_sets = get_feature_sets()\n",
- "our_training_set, our_validation_set = get_training_and_validation_sets(our_feature_sets)\n",
- "print \"Size of our data set: {}\".format(len(our_feature_sets))\n",
- " \n",
- "print \"Now training the classifier and testing the accuracy...\"\n",
- "classifier = run_classification(our_training_set, our_validation_set)\n",
- " \n",
- "end_time = time.time()\n",
- "completion_time = end_time - start_time\n",
- "print \"It took {} seconds to run the algorithm\".format(completion_time)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "test\n",
- "Let's use Naive Bayes!\n",
- "Size of our data set: 5563\n",
- "Now training the classifier and testing the accuracy...\n",
- "The accuracy was.... 0.870590878454"
- ]
- },
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "\n",
- "It took 0.120777130127 seconds to run the algorithm\n"
- ]
- }
- ],
- "prompt_number": 2
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [],
- "language": "python",
- "metadata": {},
- "outputs": []
- }
- ],
- "metadata": {}
- }
- ]
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement