Part-of-Speech tagging based on Soundex features. Take a look at the Python code below.
Source Code
===========
import re
import nltk
from nltk.corpus import brown
from nltk.tag.sequential import *


def soundex(text):
    """Return a four-character Soundex-style code for ``text``.

    The first character of ``text`` is kept (upper-cased).  The remaining
    letters are mapped to digits by consonant group, with each run of
    letters from the same group collapsing to a single digit:

        1: b f p v      2: c g j k q s x z      3: d t
        4: l            5: m n                  6: r

    Everything after the first character that is not a digit is dropped,
    and the result is truncated or right-padded with ``0`` to length 4.

    Note that no input validation is done here: digits already present in
    ``text`` survive into the code, and an empty string raises
    ``IndexError``.

    :param text: the token to encode (non-empty string).
    :return: a string of length 4.
    :raises IndexError: if ``text`` is empty.
    """
    first = text[0].upper()
    # str.lower() returns a new string.  The original discarded the result,
    # so upper-case letters never matched the groups below and a capitalised
    # token such as "Lloyd" came out as L430 instead of L300.
    text = text.lower()
    text = re.sub('[bfpv]+', '1', text)
    text = re.sub('[cgjkqsxz]+', '2', text)
    text = re.sub('[dt]+', '3', text)
    text = re.sub('[l]+', '4', text)
    text = re.sub('[mn]+', '5', text)
    text = re.sub('[r]+', '6', text)
    for c in '123456':
        # Two equal codes separated by h or w count once, e.g. "2h26" -> "2h6".
        # NOTE(review): the rule only fires when another code digit follows
        # the pair; confirm whether that restriction is intended.
        text = re.sub(r'(' + c + ')([hw])(' + c + ')([1-6])', r'\1\2\4', text)
    # The first position is represented by the saved letter, not its digit.
    result = first + re.sub('[^0-9]+', '', text[1:])
    result = result[:4] + '0000'[:4 - len(result[:4])]
    return result


class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """Classifier-based POS tagger with an added ``'soundex'`` feature."""

    def feature_detector(self, tokens, index, history):
        """Return the parent feature dict plus the current token's Soundex code.

        :param tokens: the tokens of the sentence being tagged.
        :param index: position in ``tokens`` of the token to describe.
        :param history: passed through unchanged to the parent detector.
        :return: the parent's feature dict with ``features['soundex']`` set
            to ``soundex(tokens[index])``.
        """
        features = super(SoundexBasedPOSTagger, self).feature_detector(
            tokens, index, history)
        features['soundex'] = soundex(tokens[index])
        return features


# Train on the first 90% of the Brown "news" sentences, test on the rest.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

# A single pre-formatted argument prints identically under the Python 2
# print statement and the Python 3 print function.
soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
print("Accuracy (with soundex): %s" % soundex_tagger.evaluate(test_sents))
tagger = ClassifierBasedPOSTagger(train=train_sents)
print("Accuracy (without soundex): %s" % tagger.evaluate(test_sents))
Result
======
>>>
...
Accuracy (with soundex): 0.889763779528
Accuracy (without soundex): 0.887571015648
As you can see in the evaluation above, accuracy is slightly better when the Soundex code is added to the classifier's feature set.
Reference
=========
[1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html
[2] Russell, R.C. (1918). United States patent 1,261,167. Washington, United States Patent Office.
Credit
======
Kanokwut Thanadkarn
Appendix
========
Ten-fold cross-validation results on all Brown corpus categories.
===========================================================
Category Unigram Bigram Trigram Classi. Soundex
mystery 85.0129 86.5653 86.4222 90.0662 90.3430
belles_lettres 86.2852 87.5612 87.5470 90.3835 90.5070
humor 80.1900 80.5058 80.4568 86.2681 86.6681
government 83.0616 85.0469 84.9953 86.8605 87.1182
fiction 84.9116 85.9910 85.9247 89.7749 90.1300
reviews 79.5728 80.3393 80.3386 86.2385 86.4775
religion 82.0312 82.9264 82.7888 86.5820 87.0057
romance 85.2516 86.6441 86.5653 90.0606 90.3025
science_fiction 81.3618 81.7111 81.5965 86.3424 86.9700
adventure 85.1166 86.4087 86.2206 90.2558 90.6666
editorial 82.6672 83.8054 83.6872 87.4071 87.7115
hobbies 82.2784 83.1417 83.0963 86.1888 86.4665
lore 84.4681 85.4592 85.4197 88.8270 89.0992
news 83.2413 84.2187 84.0862 88.2912 88.4107
learned 86.4339 87.5311 87.4997 88.9631 88.9764
Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569
===========================================================
[Source Code]
import re
import nltk
import pickle
import os.path
from nltk.corpus import brown
from nltk.tag.sequential import *
def soundex(text):
    """Return a four-character Soundex-style code for ``text``.

    The first letter is kept (upper-cased).  The remaining letters are
    mapped to digits by consonant group, with each run of letters from the
    same group collapsing to a single digit:

        1: b f p v      2: c g j k q s x z      3: d t
        4: l            5: m n                  6: r

    Letters outside these groups are dropped, and the result is truncated
    or right-padded with ``0`` to length 4.

    :param text: the token to encode.
    :return: a string of length 4, or ``None`` when ``text`` is not purely
        alphabetic (this includes the empty string).
    """
    if not text.isalpha():
        return None

    first = text[0].upper()
    # str.lower() returns a new string.  The original discarded the result,
    # so upper-case letters never matched the groups below and a capitalised
    # token such as "Lloyd" came out as L430 instead of L300.
    text = text.lower()

    groups = (
        ('[bfpv]+', '1'),
        ('[cgjkqsxz]+', '2'),
        ('[dt]+', '3'),
        ('[l]+', '4'),
        ('[mn]+', '5'),
        ('[r]+', '6'),
    )
    for pattern, digit in groups:
        text = re.sub(pattern, digit, text)

    for c in '123456':
        # Two equal codes separated by h or w count once, e.g. "2h26" -> "2h6".
        # NOTE(review): the rule only fires when another code digit follows
        # the pair; confirm whether that restriction is intended.
        text = re.sub(r'(' + c + ')([hw])(' + c + ')([1-6])', r'\1\2\4', text)

    # The first position is represented by the saved letter, not its digit.
    result = first + re.sub('[^0-9]+', '', text[1:])
    result = result[:4] + '0000'[:4 - len(result[:4])]
    return result
class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """Classifier-based POS tagger with an added ``'soundex'`` feature."""

    def feature_detector(self, tokens, index, history):
        """Build the feature dict for the token at ``index``.

        Starts from whatever the parent class's detector returns for the
        same arguments and stores ``soundex(tokens[index])`` under the
        ``'soundex'`` key before returning it.
        """
        parent = super(SoundexBasedPOSTagger, self)
        feats = parent.feature_detector(tokens, index, history)
        current_token = tokens[index]
        feats['soundex'] = soundex(current_token)
        return feats
def kfold(data, r, k=10):
    """Split ``data`` into the training and test parts of fold ``r`` of ``k``.

    Fold boundaries are ``int(len(data) / k * r)`` and
    ``int(len(data) / k * (r + 1))``, so fold sizes differ by at most one
    item when ``len(data)`` is not a multiple of ``k``.

    :param data: a sliceable sequence with a length.
    :param r: zero-based index of the fold to hold out (``0 <= r < k``).
    :param k: total number of folds.
    :return: ``(train, test)`` as two lists; ``test`` is the held-out slice
        and ``train`` is everything before and after it, in original order.
    :raises ValueError: if ``r`` is outside ``0 <= r < k``.
    """
    if not 0 <= r < k:
        # A negative fold index would turn into negative slice bounds and
        # silently select items from the end of ``data``.
        raise ValueError("fold index r=%r must satisfy 0 <= r < k=%r" % (r, k))

    fold_size = len(data) / float(k)
    test_start = int(fold_size * r)
    test_end = int(fold_size * (r + 1))

    # Slicing replaces the original per-item ``i not in range(...)`` test,
    # which scans a list for every element on Python 2.
    test = list(data[test_start:test_end])
    train = list(data[:test_start]) + list(data[test_end:])
    return train, test
def evaluate(sents, k=10):
    """Cross-validate five taggers on ``sents`` and print a per-fold table.

    For every fold produced by ``kfold`` the following taggers are trained
    on the training part and scored on the held-out part with their
    ``evaluate`` method:

    * unigram  - ``nltk.UnigramTagger`` backing off to ``DefaultTagger('NN')``
    * bigram   - ``nltk.BigramTagger`` backing off to the unigram tagger
    * trigram  - ``nltk.TrigramTagger`` backing off to the bigram tagger
    * classifier - ``ClassifierBasedPOSTagger``
    * soundex  - ``SoundexBasedPOSTagger``

    One tab-separated row is printed per fold, followed by an ``Avg.`` row.

    :param sents: tagged sentences to split into folds.
    :param k: number of folds (defaults to the original hard-coded 10).
    :return: dict mapping each of the five names above to the list of its
        ``k`` per-fold scores.
    """
    names = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')
    columns = "\t%0.5f" * len(names)
    score = dict((name, []) for name in names)

    print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
    for i in range(k):
        train_sents, test_sents = kfold(sents, i, k)

        # The n-gram taggers form a back-off chain, so they must be built
        # in this order.
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        score['unigram'].append(t1.evaluate(test_sents))
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        score['bigram'].append(t2.evaluate(test_sents))
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        score['trigram'].append(t3.evaluate(test_sents))

        classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
        score['classifier'].append(classifier_tagger.evaluate(test_sents))
        soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
        score['soundex'].append(soundex_tagger.evaluate(test_sents))

        print(("%d" + columns)
              % ((i,) + tuple(score[name][i] for name in names)))

    # The trailing "\n" leaves a blank line after the table, as before.
    print(("Avg." + columns + "\n")
          % tuple(sum(score[name]) / k for name in names))
    return score
# Column order shared by every table printed below.
_TAGGER_NAMES = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')


def _mean(values):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(values) / float(len(values))


def _print_fold_table(scores):
    """Print the per-fold rows and the ``Avg.`` row for one category."""
    columns = "\t%0.5f" * len(_TAGGER_NAMES)
    print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
    for i in range(len(scores[_TAGGER_NAMES[0]])):
        print(("%d" + columns)
              % ((i,) + tuple(scores[name][i] for name in _TAGGER_NAMES)))
    print(("Avg." + columns + "\n")
          % tuple(_mean(scores[name]) for name in _TAGGER_NAMES))


# Per-category score dicts, keyed by Brown category name.
result = {}

# Set a category to True to leave it out of the run.
skip_categories = {
    'adventure': False, 'belles_lettres': False, 'editorial': False,
    'fiction': False, 'government': False, 'hobbies': False,
    'humor': False, 'learned': False, 'lore': False, 'mystery': False,
    'news': False, 'religion': False, 'reviews': False, 'romance': False,
    'science_fiction': False}

for c in brown.categories():
    print("category : %s" % c)
    # .get() so that a category missing from the table is evaluated rather
    # than raising KeyError.
    if skip_categories.get(c, False):
        print("*** skipped ***")
        continue

    # Scores are cached per category so an interrupted run can resume.
    obj_file = "result.brown.%s" % c
    if os.path.isfile(obj_file):
        # NOTE: pickle.load can execute arbitrary code from the file.  These
        # cache files are presumably only ever written by this script; do
        # not point this at files from an untrusted source.
        with open(obj_file, 'rb') as cache:
            result[c] = pickle.load(cache)
        # evaluate() prints this table itself on a fresh run; replay it here.
        _print_fold_table(result[c])
    else:
        tagged_corpus = brown.tagged_sents(categories=c)
        result[c] = evaluate(tagged_corpus)
        # Binary mode is required for pickle on Python 3 and is harmless on
        # Python 2; ``with`` closes the handle the original leaked.
        with open(obj_file, 'wb') as cache:
            pickle.dump(result[c], cache)

# Mean score of each tagger per category, as fractions in [0, 1].
sum_score = {'unigram': [], 'bigram': [], 'trigram': [], 'classifier': [],
             'soundex': []}
summary_columns = "\t%2.4f" * len(_TAGGER_NAMES)

print("Summary Score...")
print("Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
for c in result:
    category_means = [_mean(result[c][name]) for name in _TAGGER_NAMES]
    print(("%s\t\t" + summary_columns)
          % ((c,) + tuple(m * 100 for m in category_means)))
    for name, m in zip(_TAGGER_NAMES, category_means):
        sum_score[name].append(m)

# With every category skipped there is nothing to average.
if result:
    print(("Average Score (%%)\t\t" + summary_columns + "\n")
          % tuple(sum(sum_score[name]) / len(result) * 100
                  for name in _TAGGER_NAMES))