Advertisement
Guest User

Untitled

a guest
Apr 25th, 2015
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.86 KB | None | 0 0
  1. {
  2. "metadata": {
  3. "name": "",
  4. "signature": "sha256:6bdb7954cfe1c233e9b8cf2e3978cb41700c561f90d847e1fba49089cbba4b0f"
  5. },
  6. "nbformat": 3,
  7. "nbformat_minor": 0,
  8. "worksheets": [
  9. {
  10. "cells": [
  11. {
  12. "cell_type": "code",
  13. "collapsed": false,
  14. "input": [
  15. "# built-in python library for interacting with .csv files\n",
  16. "import csv\n",
  17. "# natural language toolkit is a great python library for natural language processing\n",
  18. "import nltk\n",
  19. "# built-in python library for utility functions that introduce randomness\n",
  20. "import random\n",
  21. "# built-in python library for measuring time-related things\n",
  22. "import time\n",
  23. " \n",
  24. "print \"test\"\n",
  25. " \n",
  26. "def get_length_bucket(sms_length):\n",
  27. " \"\"\"\n",
  28. " buckets the SMS length into either short / medium / long\n",
  29. " \"\"\"\n",
  30. " if sms_length < 20:\n",
  31. " return \"short\"\n",
  32. " elif sms_length < 50:\n",
  33. " return \"medium\"\n",
  34. " else:\n",
  35. " return \"long\"\n",
  36. " \n",
  37. " \n",
  38. "def sms_features(sms):\n",
  39. " \"\"\"\n",
  40. " Returns a dictionary of the features of the sms we want our model\n",
  41. " to be based on, e.g. sms_length.\n",
  42. " \n",
  43. " So if the SMS was \"Hey!\", the output of this function would be\n",
  44. " {\n",
  45. " \"length\": \"short\"\n",
  46. " }\n",
  47. " \n",
  48. " If the SMS was \"Hey this is a really great idea and I think that we should totally implement this technique\",\n",
  49. " then the output would be\n",
  50. " {\n",
  51. " \"length\": \"long\"\n",
  52. " }\n",
  53. " \"\"\"\n",
  54. " return {\n",
  55. " \"length\": get_length_bucket(len(sms)),\n",
  56. " 'contains_reply': 'reply' in sms,\n",
  57. " 'contains_winner': 'winner' in sms,\n",
  58. " \n",
  59. " }\n",
  60. " \n",
  61. " \n",
  62. " \n",
  63. "def get_feature_sets():\n",
  64. " \"\"\"\n",
  65. " # Step 1: This reads in the rows from the csv file which look like this:\n",
  66. " 0, I'm so sad\n",
  67. " 1, Happy!\n",
  68. " \n",
  69. " where the first field is the label; 0=negative, 1=positive\n",
  70. " and the second field is the body of the SMS\n",
  71. " \n",
  72. " # Step 2: Turn the csv rows into feature dictionaries using `sms_features` function above.\n",
  73. " \n",
  74. " The output of this function run on the example in Step 1 will look like this:\n",
  75. " [\n",
  76. " ({\"length\": \"short\"}, 0), # this corresponds to 0, I'm so sad\n",
  77. " ({\"length\": \"short\"}, 1) # this corresponds to 1, Happy!\n",
  78. " ]\n",
  79. " \n",
  80. " You can think about this more abstractly as this:\n",
  81. " [\n",
  82. " (feature_dictionary, label), # corresponding to row 0\n",
  83. " ... # corresponding to row 1 ... n\n",
  84. " ]\n",
  85. " \"\"\"\n",
  86. " # open the file, which we've placed at /home/vagrant/repos/datasets/sms_spam_or_ham.csv\n",
  87. " # 'rb' means read-only mode and binary encoding\n",
  88. " f = open('/home/vagrant/repos/datasets/sms_spam_or_ham.csv', 'rb')\n",
  89. " \n",
  90. " # let's read in the rows from the csv file\n",
  91. " rows = []\n",
  92. " for row in csv.reader(f):\n",
  93. " rows.append(row)\n",
  94. " \n",
  95. " # now let's generate the output that we specified in the comments above\n",
  96. " output_data = []\n",
  97. " \n",
  98. " # let's just run it on 100,000 rows first, instead of all 1.5 million rows\n",
  99. " # when you experiment with the `sms_features` function to improve accuracy\n",
  100. " # feel free to get rid of the row limit and just run it on the whole set\n",
  101. " for row in rows[:100000]:\n",
  102. " # Remember that row[0] is the label, either 0 or 1\n",
  103. " # and row[1] is the SMS body\n",
  104. " \n",
  105. " # get the label\n",
  106. " label = row[0]\n",
  107. " \n",
  108. " # get the SMS body and compute the feature dictionary\n",
  109. " try:\n",
  110. " feature_dict = sms_features(row[1])\n",
  111. " except IndexError:\n",
  112. " continue\n",
  113. " # add the tuple of feature_dict, label to output_data\n",
  114. " data = (feature_dict, label)\n",
  115. " output_data.append(data)\n",
  116. " \n",
  117. " # close the file\n",
  118. " f.close()\n",
  119. " return output_data\n",
  120. " \n",
  121. "def get_training_and_validation_sets(feature_sets):\n",
  122. " \"\"\"\n",
  123. " This takes the output of `get_feature_sets`, randomly shuffles it to ensure we're\n",
  124. " taking an unbiased sample, and then splits the set of features into\n",
  125. " a training set and a validation set.\n",
  126. " \"\"\"\n",
  127. " # randomly shuffle the feature sets\n",
  128. " random.shuffle(feature_sets)\n",
  129. " \n",
  130. " # get the number of data points that we have\n",
  131. " count = len(feature_sets)\n",
  132. " # 20% of the set, also called \"corpus\", should be training, as a rule of thumb, but not gospel.\n",
  133. " \n",
  134. " # we'll slice this list 20% the way through\n",
  135. " slicing_point = int(.20 * count)\n",
  136. " \n",
  137. " # the training set will be the first segment\n",
  138. " training_set = feature_sets[:slicing_point]\n",
  139. " \n",
  140. " # the validation set will be the second segment\n",
  141. " validation_set = feature_sets[slicing_point:]\n",
  142. " return training_set, validation_set\n",
  143. " \n",
  144. "def run_classification(training_set, validation_set):\n",
  145. " # train the NaiveBayesClassifier on the training_set\n",
  146. " classifier = nltk.NaiveBayesClassifier.train(training_set)\n",
  147. " # let's see how accurate it was\n",
  148. " accuracy = nltk.classify.accuracy(classifier, validation_set)\n",
  149. " print \"The accuracy was.... {}\".format(accuracy)\n",
  150. " return classifier\n",
  151. " \n",
  152. "def predict(classifier, new_sms):\n",
  153. " \"\"\"\n",
  154. " Given a trained classifier and a fresh data point (a SMS),\n",
  155. " this will predict its label, either 0 or 1.\n",
  156. " \"\"\"\n",
  157. " return classifier.classify(sms_features(new_sms))\n",
  158. " \n",
  159. " \n",
  160. "# Now let's use the above functions to run our program\n",
  161. "start_time = time.time()\n",
  162. " \n",
  163. "print \"Let's use Naive Bayes!\"\n",
  164. " \n",
  165. "our_feature_sets = get_feature_sets()\n",
  166. "our_training_set, our_validation_set = get_training_and_validation_sets(our_feature_sets)\n",
  167. "print \"Size of our data set: {}\".format(len(our_feature_sets))\n",
  168. " \n",
  169. "print \"Now training the classifier and testing the accuracy...\"\n",
  170. "classifier = run_classification(our_training_set, our_validation_set)\n",
  171. " \n",
  172. "end_time = time.time()\n",
  173. "completion_time = end_time - start_time\n",
  174. "print \"It took {} seconds to run the algorithm\".format(completion_time)"
  175. ],
  176. "language": "python",
  177. "metadata": {},
  178. "outputs": [
  179. {
  180. "output_type": "stream",
  181. "stream": "stdout",
  182. "text": [
  183. "test\n",
  184. "Let's use Naive Bayes!\n",
  185. "Size of our data set: 5563\n",
  186. "Now training the classifier and testing the accuracy...\n",
  187. "The accuracy was.... 0.870590878454"
  188. ]
  189. },
  190. {
  191. "output_type": "stream",
  192. "stream": "stdout",
  193. "text": [
  194. "\n",
  195. "It took 0.120777130127 seconds to run the algorithm\n"
  196. ]
  197. }
  198. ],
  199. "prompt_number": 2
  200. },
  201. {
  202. "cell_type": "code",
  203. "collapsed": false,
  204. "input": [],
  205. "language": "python",
  206. "metadata": {},
  207. "outputs": []
  208. }
  209. ],
  210. "metadata": {}
  211. }
  212. ]
  213. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement