Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.corpus import stopwords
- from nltk.stem import PorterStemmer
- from nltk.tokenize import WordPunctTokenizer
- from nltk.collocations import BigramCollocationFinder
- from nltk.metrics import BigramAssocMeasures
- from nltk.classify import NaiveBayesClassifier
- from nltk.classify.util import accuracy
- import sys
- import urllib2
def extract_words(text):
    '''
    Extract classifier features from *text*.

    Tokenizes the input with WordPunctTokenizer, appends the 500 most
    significant bigrams (chi-squared) as extra space-joined "word"
    tokens, then lowercases and Porter-stems every token that is not an
    English stopword and is longer than one character.

    Returns a list of stemmed token strings.
    '''
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    # PERF FIX: build the stopword set once.  The original called
    # stopwords.words('english') inside the comprehension, re-reading the
    # corpus and linearly scanning the resulting list for every token.
    # Membership is still tested on the raw token (before lower()), exactly
    # as before, so filtering behavior is unchanged.
    stopset = set(stopwords.words('english'))
    result = [stemmer.stem(x.lower()) for x in tokens
              if x not in stopset and len(x) > 1]
    return result
def get_feature(word):
    '''Wrap a single word as a one-entry {word: True} feature dict.'''
    return {word: True}
def bag_of_words(words):
    '''
    Turn an iterable of words into a bag-of-words feature dict mapping
    each word to True (the shape NLTK classifiers expect).

    IDIOM FIX: dict.fromkeys replaces dict([(w, True) for w in words]),
    which built a throwaway list of tuples just to feed dict().  Duplicate
    words still collapse to a single True entry, exactly as before.
    '''
    return dict.fromkeys(words, True)
def create_training_dict(text, sense):
    '''Return a one-element list [(features, sense)] ready for a
    classifier's test method.'''
    features = bag_of_words(extract_words(text))
    return [(features, sense)]
- def run_classifier_tests(classifier):
- testfiles = [{'fruit': 'http://litfuel.net/plush/files/disambiguation/apple-fruit-training.txt'},
- {'company': 'http://litfuel.net/plush/files/disambiguation/apple-company-training.txt'}]
- testfeats = []
- for file in testfiles:
- for sense, loc in file.iteritems():
- for line in urllib2.urlopen(loc):
- testfeats = testfeats + create_training_dict(line, sense)
- acc = accuracy(classifier, testfeats) * 100
- print 'accuracy: %.2f%%' % acc
- sys.exit()
- if __name__ == '__main__':
- # create our dict of training data
- texts = {}
- texts['fruit'] = 'http://litfuel.net/plush/files/disambiguation/apple-fruit.txt'
- texts['company'] = 'http://litfuel.net/plush/files/disambiguation/apple-company.txt'
- #holds a dict of features for training our classifier
- train_set = []
- # loop through each item, grab the text, tokenize it and create a training feature with it
- for sense, file in texts.iteritems():
- print "training %s " % sense
- text = urllib2.urlopen(file, 'r').read()
- features = extract_words(text)
- train_set = train_set + [(get_feature(word), sense) for word in features]
- classifier = NaiveBayesClassifier.train(train_set)
- # uncomment out this line to see the most informative words the classifier will use
- #classifier.show_most_informative_features(20)
- # uncomment out this line to see how well our accuracy is using some hand curated tweets
- #run_classifier_tests(classifier)
- for line in urllib2.urlopen("http://litfuel.net/plush/files/disambiguation/apple-tweets.txt", 'r'):
- tokens = bag_of_words(extract_words(line))
- decision = classifier.classify(tokens)
- result = "%s - %s" % (decision,line )
- print result
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement