Part-of-Speech tagging based on Soundex features. Take a look at the Python code below… Source Code =========== import re import nltk from nltk.corpus import brown from nltk.tag.sequential import * def soundex(text): first = text[0].upper() text.lower() text = re.sub('[bfpv]+','1',text) text = re.sub('[cgjkqsxz]+','2',text) text = re.sub('[dt]+','3',text) text = re.sub('[l]+','4',text) text = re.sub('[mn]+','5',text) text = re.sub('[r]+','6',text) for c in '123456': text = re.sub(r'('+c+')([hw])('+c+')([1-6])',r'\1\2\4',text) result = first + re.sub('[^0-9]+','',text[1:]) result = result[:4] + '0000'[:4-len(result[:4])] return result class SoundexBasedPOSTagger(ClassifierBasedPOSTagger): def feature_detector(self, tokens, index, history): features = super(SoundexBasedPOSTagger, self).feature_detector( tokens, index, history) features['soundex'] = soundex(tokens[index]) return features brown_tagged_sents = brown.tagged_sents(categories='news') brown_sents = brown.sents(categories='news') size = int(len(brown_tagged_sents) * 0.9) train_sents = brown_tagged_sents[:size] test_sents = brown_tagged_sents[size:] soundex_tagger = SoundexBasedPOSTagger(train=train_sents) print "Accuracy (with soundex):", soundex_tagger.evaluate(test_sents) tagger = ClassifierBasedPOSTagger(train=train_sents) print "Accuracy (without soundex):", tagger.evaluate(test_sents) Result ====== >>> ... Accuracy (with soundex): 0.889763779528 Accuracy (without soundex): 0.887571015648 As you can see in the evaluation above, accuracy is slightly better when the Soundex code is added to the classifier's feature set. Reference ========= [1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html [2] Russell, R.C. (1918). United States patent 1261167. Washington, United States Patent Office. Credit ====== Kanokwut Thanadkarn Appendix ======== Ten-fold evaluation results on all Brown corpus categories. =========================================================== Category Unigram Bigram Trigram Classi. 
# Soundex
# mystery 85.0129 86.5653 86.4222 90.0662 90.3430
# belles_lettres 86.2852 87.5612 87.5470 90.3835 90.5070
# humor 80.1900 80.5058 80.4568 86.2681 86.6681
# government 83.0616 85.0469 84.9953 86.8605 87.1182
# fiction 84.9116 85.9910 85.9247 89.7749 90.1300
# reviews 79.5728 80.3393 80.3386 86.2385 86.4775
# religion 82.0312 82.9264 82.7888 86.5820 87.0057
# romance 85.2516 86.6441 86.5653 90.0606 90.3025
# science_fiction 81.3618 81.7111 81.5965 86.3424 86.9700
# adventure 85.1166 86.4087 86.2206 90.2558 90.6666
# editorial 82.6672 83.8054 83.6872 87.4071 87.7115
# hobbies 82.2784 83.1417 83.0963 86.1888 86.4665
# lore 84.4681 85.4592 85.4197 88.8270 89.0992
# news 83.2413 84.2187 84.0862 88.2912 88.4107
# learned 86.4339 87.5311 87.4997 88.9631 88.9764
# Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569
# ===========================================================
# [Source Code]
import re
import nltk
import pickle
import os.path
from nltk.corpus import brown
from nltk.tag.sequential import *

# NOTE: every print below is called with ONE pre-formatted string, so the
# Python 2 statement form and the Python 3 function form emit identical text.

# Score columns, in the order they are printed.
_TAGGERS = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')

_FOLD_HEADER = "Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex"
_FOLD_ROW = "%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f"
_FOLD_AVERAGE = "Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n"


def soundex(text):
    """Return a four-character Soundex-style code for *text*, or None.

    The code is the upper-cased first letter followed by up to three digits,
    right-padded with '0'.  Letters map to digits by class (bfpv=1,
    cgjkqsxz=2, dt=3, l=4, mn=5, r=6); a run of letters from the same class
    yields a single digit and every other character is dropped.

    Returns None when ``text.isalpha()`` is false (this includes the empty
    string), so punctuation and numbers get no Soundex feature.

    Example: ``soundex("Robert")`` returns ``"R163"``.
    """
    if not text.isalpha():
        return None
    first = text[0].upper()
    # str.lower() returns a new string; the result has to be assigned.
    # Without the assignment capital letters are never encoded, so an
    # all-caps token collapsed to "<first letter>000".
    text = text.lower()
    text = re.sub('[bfpv]+', '1', text)
    text = re.sub('[cgjkqsxz]+', '2', text)
    text = re.sub('[dt]+', '3', text)
    text = re.sub('[l]+', '4', text)
    text = re.sub('[mn]+', '5', text)
    text = re.sub('[r]+', '6', text)
    # For each digit c: in "c h/w c <digit>" drop the second c, i.e. the same
    # code separated only by 'h' or 'w' is counted once.  Kept as one pass per
    # digit because each pass sees the output of the previous one.
    for c in '123456':
        text = re.sub(r'(' + c + ')([hw])(' + c + ')([1-6])', r'\1\2\4', text)
    # text[1:] skips whatever the first letter (or its run) turned into.
    digits = re.sub('[^0-9]+', '', text[1:])
    return (first + digits)[:4].ljust(4, '0')


class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger with one extra feature: the token's Soundex code."""

    def feature_detector(self, tokens, index, history):
        """Return the parent's feature dict plus a 'soundex' entry for tokens[index]."""
        features = super(SoundexBasedPOSTagger, self).feature_detector(
            tokens, index, history)
        features['soundex'] = soundex(tokens[index])
        return features


def kfold(data, r, k=10):
    """Split *data* into ``(train, test)`` lists for fold *r* of *k*.

    The test fold covers indices ``int(len(data)/k * r)`` up to (excluding)
    ``int(len(data)/k * (r + 1))``; everything else is training data.
    *data* must support ``len()`` and slicing.

    Raises ValueError when *r* is not in ``range(k)``.
    """
    if not 0 <= r < k:
        raise ValueError("fold index %r is outside range(%r)" % (r, k))
    fsize = len(data) / float(k)
    test_start = int(fsize * r)
    test_end = int(fsize * (r + 1))
    test = list(data[test_start:test_end])
    # Two slices instead of an index-membership test per element.
    train = list(data[:test_start]) + list(data[test_end:])
    return train, test


def _mean(values):
    """Arithmetic mean of a non-empty list of numbers."""
    return sum(values) / float(len(values))


def _fold_row(score, i):
    """Format fold *i* of a score dict as one tab-separated table row."""
    return _FOLD_ROW % ((i,) + tuple(score[name][i] for name in _TAGGERS))


def _average_row(score):
    """Format the per-tagger mean over all folds as the closing table row."""
    return _FOLD_AVERAGE % tuple(_mean(score[name]) for name in _TAGGERS)


def _print_fold_table(score):
    """Print header, one row per stored fold, and the average for *score*."""
    print(_FOLD_HEADER)
    for i in range(len(score[_TAGGERS[0]])):
        print(_fold_row(score, i))
    print(_average_row(score))


def evaluate(sents, folds=10):
    """Cross-validate five taggers on *sents* and return their fold scores.

    For every fold a DefaultTagger('NN') -> Unigram -> Bigram -> Trigram
    backoff chain, a ClassifierBasedPOSTagger and a SoundexBasedPOSTagger are
    trained on the training split and scored with ``.evaluate()`` on the test
    split.  A table row is printed as soon as each fold finishes, followed by
    the averages.

    Returns a dict mapping each name in ('unigram', 'bigram', 'trigram',
    'classifier', 'soundex') to its list of per-fold scores.
    """
    score = dict((name, []) for name in _TAGGERS)
    print(_FOLD_HEADER)
    for i in range(folds):
        train_sents, test_sents = kfold(sents, i, folds)

        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        score['unigram'].append(t1.evaluate(test_sents))
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        score['bigram'].append(t2.evaluate(test_sents))
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        score['trigram'].append(t3.evaluate(test_sents))

        classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
        score['classifier'].append(classifier_tagger.evaluate(test_sents))
        soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
        score['soundex'].append(soundex_tagger.evaluate(test_sents))

        print(_fold_row(score, i))
    print(_average_row(score))
    return score


result = {}
# Set a category to True to leave it out of the run.
skip_categories = {
    'adventure': False, 'belles_lettres': False, 'editorial': False,
    'fiction': False, 'government': False, 'hobbies': False,
    'humor': False, 'learned': False, 'lore': False,
    'mystery': False, 'news': False, 'religion': False,
    'reviews': False, 'romance': False, 'science_fiction': False,
}

for c in brown.categories():
    print("category : %s" % c)
    if skip_categories.get(c, False):
        print("*** skipped ***")
        continue
    obj_file = "result.brown.%s" % c
    if os.path.isfile(obj_file):
        # Reuse scores cached by an earlier run.
        # NOTE(review): pickle.load runs whatever the file encodes; only
        # cache files written by this script should be placed here.
        with open(obj_file, 'rb') as cache:
            result[c] = pickle.load(cache)
        _print_fold_table(result[c])
    else:
        tagged_corpus = brown.tagged_sents(categories=c)
        result[c] = evaluate(tagged_corpus)
        with open(obj_file, 'wb') as cache:
            pickle.dump(result[c], cache)

sum_score = dict((name, []) for name in _TAGGERS)
print("Summary Score...")
print("Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
for c in result:
    means = [_mean(result[c][name]) for name in _TAGGERS]
    print("%s\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f"
          % ((c,) + tuple(m * 100 for m in means)))
    for name, m in zip(_TAGGERS, means):
        sum_score[name].append(m)
if result:
    # Mean of the per-category means, reported as a percentage.
    print("Average Score (%%)\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f\n"
          % tuple(_mean(sum_score[name]) * 100 for name in _TAGGERS))