Pastebin launched a little side project called VERYVIRAL.com, check it out ;-) Want more features on Pastebin? Sign Up, it's FREE!
Guest

Jim Plush

By: a guest on Nov 30th, 2010  |  syntax: Python  |  size: 3.20 KB  |  views: 3,330  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1.  
  2. from nltk.corpus import stopwords
  3. from nltk.stem import PorterStemmer
  4. from nltk.tokenize import WordPunctTokenizer
  5. from nltk.collocations import BigramCollocationFinder
  6. from nltk.metrics import BigramAssocMeasures
  7. from nltk.classify import NaiveBayesClassifier
  8. from nltk.classify.util import accuracy
  9. import sys
  10. import urllib2
  11.  
  def extract_words(text):
      """Turn raw text into the list of stemmed tokens used as features.

      The text is split with ``WordPunctTokenizer``; the 500 best bigrams
      (ranked by chi-squared) are appended as ``"first second"`` strings;
      every surviving token is lower-cased and Porter-stemmed.

      Tokens of length one and English stop words are dropped.  The stop
      word test is done on the lower-cased token so that capitalised stop
      words ("The", "And") are filtered as well.

      Returns a list of strings.
      """
      stemmer = PorterStemmer()
      tokens = WordPunctTokenizer().tokenize(text)

      bigram_finder = BigramCollocationFinder.from_words(tokens)
      bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
      # Each bigram is a 2-tuple; join it into a single "w1 w2" feature token.
      tokens.extend("%s %s" % pair for pair in bigrams)

      # Fetch the stop word list once and keep it in a set: evaluating
      # stopwords.words() for every token and scanning a list is needlessly slow.
      stop_words = set(stopwords.words('english'))

      result = []
      for token in tokens:
          lowered = token.lower()
          if len(token) > 1 and lowered not in stop_words:
              result.append(stemmer.stem(lowered))
      return result
  32.  
  def get_feature(word):
      """Return a single-entry feature dict ``{word: True}`` for a classifier."""
      return {word: True}
  35.  
  36.  
  def bag_of_words(words):
      """Return a feature dict mapping every word in ``words`` to ``True``.

      Duplicate words collapse into a single key; an empty iterable gives
      an empty dict.
      """
      return dict.fromkeys(words, True)
  39.  
  40.  
  def create_training_dict(text, sense):
      """Build a one-element labeled feature set for ``text``.

      Despite the name, the result is a list containing a single
      ``(features, sense)`` tuple, where ``features`` is the bag-of-words
      dict of the tokens extracted from ``text``.  Returning a list lets
      callers concatenate the results of several calls.
      """
      tokens = extract_words(text)
      return [(bag_of_words(tokens), sense)]
  45.  
  46.  
  47.  
  def run_classifier_tests(classifier):
      """Print the classifier's accuracy on the labeled test files, then exit.

      Every line of each remote test file is converted into one labeled
      feature set (label = the sense the file is listed under).  The
      accuracy over all lines is printed as a percentage.

      NOTE: this function terminates the process with ``sys.exit()`` once
      the accuracy has been printed; it never returns to the caller.
      """
      testfiles = [
          {'fruit': 'http://litfuel.net/plush/files/disambiguation/apple-fruit-training.txt'},
          {'company': 'http://litfuel.net/plush/files/disambiguation/apple-company-training.txt'},
      ]
      testfeats = []
      for labeled_file in testfiles:
          for sense, loc in labeled_file.items():
              response = urllib2.urlopen(loc)
              try:
                  for line in response:
                      # extend in place instead of rebuilding the list per line
                      testfeats.extend(create_training_dict(line, sense))
              finally:
                  # always release the HTTP connection, even on a read error
                  response.close()

      acc = accuracy(classifier, testfeats) * 100
      print('accuracy: %.2f%%' % acc)

      sys.exit()
  62.  
  63.  
  if __name__ == '__main__':

      # map each sense of "apple" to the URL of its training text
      texts = {}
      texts['fruit'] = 'http://litfuel.net/plush/files/disambiguation/apple-fruit.txt'
      texts['company'] = 'http://litfuel.net/plush/files/disambiguation/apple-company.txt'

      # list of (feature dict, sense) pairs used to train the classifier
      train_set = []

      # for each sense: download the text, tokenize it and add one
      # single-word training feature per extracted token
      for sense, url in texts.items():
          print("training %s " % sense)
          # urlopen's second positional argument is the POST body, not a file
          # mode, so no 'r' here: a plain GET is what is wanted.
          response = urllib2.urlopen(url)
          try:
              text = response.read()
          finally:
              response.close()
          features = extract_words(text)
          train_set.extend((get_feature(word), sense) for word in features)

      classifier = NaiveBayesClassifier.train(train_set)

      # uncomment this line to see the most informative words the classifier will use
      #classifier.show_most_informative_features(20)

      # uncomment this line to see how accurate the classifier is on some hand curated tweets
      #run_classifier_tests(classifier)

      # classify every tweet (one per line) and print "<sense> - <tweet>"
      tweets = urllib2.urlopen("http://litfuel.net/plush/files/disambiguation/apple-tweets.txt")
      try:
          for line in tweets:
              tokens = bag_of_words(extract_words(line))
              decision = classifier.classify(tokens)
              result = "%s - %s" % (decision, line)
              print(result)
      finally:
          tweets.close()