Advertisement
Guest User

Untitled

a guest
May 29th, 2015
297
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.68 KB | None | 0 0
  1. from nltk.probability import DictionaryProbDist
  2. from nltk import NaiveBayesClassifier
  3. from nltk import FreqDist, ConditionalFreqDist
  4. from nltk import BigramAssocMeasures
  5.  
# Labeled training corpus: sentence -> sentiment label ('pos' / 'neg').
# The sentences come in near-identical pos/neg pairs so that only the
# sentiment-bearing words differ between the two classes.
# NOTE(review): 'heppening' looks like a typo for 'happening', but it is
# training data -- changing it would change the learned word counts.
train_samples = {
    'I hate you and you are a bad person': 'neg',
    'I love you and you are a good person': 'pos',
    'I fail at everything and I want to kill people' : 'neg',
    'I win at everything and I want to love people' : 'pos',
    'sad are things are heppening. fml' : 'neg',
    'good are things are heppening. gbu' : 'pos',
    'I am so poor' : 'neg',
    'I am so rich' : 'pos',
    'I hate you mommy ! You are my terrible person' : 'neg',
    'I love you mommy ! You are my amazing person' : 'pos',
    'I want to kill butterflies since they make me sad' : 'neg',
    'I want to chase butterflies since they make me happy' : 'pos',
    'I want to hurt bunnies' : 'neg',
    'I want to hug bunnies' : 'pos',
    'You make me frown' : 'neg',
    'You make me smile' : 'pos',
}
  24.  
  25. word_fd = FreqDist()
  26. label_word_fd = ConditionalFreqDist()
  27.  
  28. for words, label in train_samples.items():
  29. for word in words.split():
  30. word_fd.inc(word.lower())
  31. label_word_fd[label].inc(word.lower())
  32.  
  33. print word_fd
  34. print label_word_fd
  35.  
  36. pos_word_count = label_word_fd['pos'].N()
  37. neg_word_count = label_word_fd['neg'].N()
  38. total_word_count = pos_word_count + neg_word_count
  39.  
  40. print 'pos word count', pos_word_count
  41. print 'neg word count', neg_word_count
  42. print 'total word count', total_word_count
  43.  
  44. word_scores = {}
  45.  
  46. for word, freq in word_fd.iteritems():
  47. pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
  48. (freq, pos_word_count), total_word_count)
  49. neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
  50. (freq, neg_word_count), total_word_count)
  51. word_scores[word] = pos_score + neg_score
  52.  
  53. print 'word scores', word_scores
  54.  
  55. best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:100]
  56. bestwords = set([w for w, s in best])
  57.  
  58. print '*' * 50
  59. print 'best words', bestwords
  60. print '*' * 50
  61.  
  62. def best_word_feats(words):
  63. return dict([(word, True) for word in words if word in bestwords])
  64.  
  65.  
# Unlabeled sentences used to exercise the classifier below; each mixes
# words from the hand-built feature vocabulary with unseen words.
test_samples = [
    'You are a terrible person and everything you do is bad',
    'I love you all and you make me happy',
    'I frown whenever I see you in a poor state of mind',
    'Finally getting rich from my ideas. They make me smile.',
    'My mommy is poor',
    'I love butterflies. Yay for happy',
    'Everything is fail today and I hate stuff',
]
  75.  
  76.  
  77. def gen_bow(text):
  78. words = text.split()
  79. bow = {}
  80. for word in words:
  81. bow[word.lower()] = True
  82. return bow
  83.  
# Uniform prior over the two sentiment labels.
label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})

# Single shared P(feature=True | label) distribution reused for every
# hand-picked word below.
# NOTE(review): {True: 6} does not sum to 1; DictionaryProbDist does not
# normalize unless asked -- confirm whether normalize=True was intended.
true_probdist = DictionaryProbDist({True: 6})

# Hand-built (label, word) -> probdist table standing in for trained
# feature distributions.
feature_probdist = { ## need to generate this from train_samples
    ('neg', 'no'): true_probdist,
    ('neg', 'hate'): true_probdist,
    ('neg', 'fml'): true_probdist,
    ('neg', 'poor'): true_probdist,
    ('neg', 'sad'): true_probdist,
    ('neg', 'fail'): true_probdist,
    ('neg', 'kill'): true_probdist,
    ('neg', 'evil'): true_probdist,
    ('pos', 'bunnies'): true_probdist,
    # NOTE(review): 'butteryfly' is likely a typo for 'butterfly'; it is a
    # runtime key, so fixing it would change which features match.
    ('pos', 'butteryfly'): true_probdist,
    ('pos', 'pony'): true_probdist,
    ('pos', 'love'): true_probdist,
    ('pos', 'smile'): true_probdist,
    ('pos', 'happy'): true_probdist,
    ('pos', 'amazing'): true_probdist,
    ('pos', 'yes'): true_probdist,
}
  106.  
  107.  
  108. classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
  109.  
  110. for sample in test_samples:
  111. print "%s | %s | %s" % (sample, classifier.classify(gen_bow(sample)), classifier.prob_classify(gen_bow(sample)))
  112.  
  113. classifier.show_most_informative_features()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement