Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Part-of-Speech tagging based on Soundex features. Take a look at python code below…
- Source Code
- ===========
import re
import nltk
from nltk.corpus import brown
from nltk.tag.sequential import *


def soundex(text):
    """Return a Soundex-style code for *text*.

    For an alphabetic word the result is the upper-cased first letter
    followed by three digits, zero-padded on the right (e.g. ``"R163"``
    for ``"Robert"``).  Tokens containing other characters are passed
    through the same steps without validation.  Returns ``None`` for an
    empty string.
    """
    if not text:
        return None
    first = text[0].upper()
    # str.lower() returns a new string; the result must be kept, otherwise
    # upper-case letters are never mapped to their digit class.
    text = text.lower()
    # Each run of letters from one class collapses to a single digit.
    for letters, digit in (('bfpv', '1'), ('cgjkqsxz', '2'), ('dt', '3'),
                           ('l', '4'), ('mn', '5'), ('r', '6')):
        text = re.sub('[' + letters + ']+', digit, text)
    # Two equal codes separated only by 'h'/'w' count as one code,
    # regardless of what follows them.
    text = re.sub(r'([1-6])(?:[hw]+\1)+', r'\1', text)
    # Position 0 is represented by `first`; keep only digits after it.
    digits = re.sub('[^0-9]+', '', text[1:])
    return (first + digits + '000')[:4]


class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """Classifier-based POS tagger with an extra ``'soundex'`` feature."""

    def feature_detector(self, tokens, index, history):
        """Return the inherited feature dict plus the token's Soundex code."""
        features = super(SoundexBasedPOSTagger, self).feature_detector(
            tokens, index, history)
        features['soundex'] = soundex(tokens[index])
        return features


# 90% / 10% train/test split of the Brown "news" category.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')  # not used below
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

# NOTE(review): recent NLTK releases appear to prefer `accuracy()` over
# `evaluate()` -- confirm against the installed NLTK version.
soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
print("Accuracy (with soundex): %s" % soundex_tagger.evaluate(test_sents))

tagger = ClassifierBasedPOSTagger(train=train_sents)
print("Accuracy (without soundex): %s" % tagger.evaluate(test_sents))
- Result
- ======
- >>>
- ...
- Accuracy (with soundex): 0.889763779528
- Accuracy (without soundex): 0.887571015648
- As the evaluation above shows, accuracy improves slightly when the Soundex code is added to the classifier's feature set.
- Reference
- =========
- [1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html
- [2] Russell, R.C. (1918). United States patent 1261167. Washington, United States Patent Office.
- Credit
- ======
- Kanokwut Thanadkarn
- Appendix
- ========
- Ten-fold cross-validation results on all Brown corpus categories.
- ===========================================================
- Category Unigram Bigram Trigram Classi. Soundex
- mystery 85.0129 86.5653 86.4222 90.0662 90.3430
- belles_lettres 86.2852 87.5612 87.5470 90.3835 90.5070
- humor 80.1900 80.5058 80.4568 86.2681 86.6681
- government 83.0616 85.0469 84.9953 86.8605 87.1182
- fiction 84.9116 85.9910 85.9247 89.7749 90.1300
- reviews 79.5728 80.3393 80.3386 86.2385 86.4775
- religion 82.0312 82.9264 82.7888 86.5820 87.0057
- romance 85.2516 86.6441 86.5653 90.0606 90.3025
- science_fiction 81.3618 81.7111 81.5965 86.3424 86.9700
- adventure 85.1166 86.4087 86.2206 90.2558 90.6666
- editorial 82.6672 83.8054 83.6872 87.4071 87.7115
- hobbies 82.2784 83.1417 83.0963 86.1888 86.4665
- lore 84.4681 85.4592 85.4197 88.8270 89.0992
- news 83.2413 84.2187 84.0862 88.2912 88.4107
- learned 86.4339 87.5311 87.4997 88.9631 88.9764
- Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569
- ===========================================================
- [Source Code]
- import re
- import nltk
- import pickle
- import os.path
- from nltk.corpus import brown
- from nltk.tag.sequential import *
def soundex(text):
    """Return the Soundex code of an alphabetic word.

    The code is the upper-cased first letter followed by three digits,
    zero-padded on the right (e.g. ``"R163"`` for ``"Robert"``).

    Returns ``None`` when *text* is empty or contains any non-alphabetic
    character (punctuation, digits, hyphenated tokens, ...).
    """
    if not text.isalpha():
        return None
    first = text[0].upper()
    # str.lower() returns a new string; the result must be kept, otherwise
    # upper-case letters are never mapped to their digit class.
    text = text.lower()
    # Each run of letters from one class collapses to a single digit.
    for letters, digit in (('bfpv', '1'), ('cgjkqsxz', '2'), ('dt', '3'),
                           ('l', '4'), ('mn', '5'), ('r', '6')):
        text = re.sub('[' + letters + ']+', digit, text)
    # Two equal codes separated only by 'h'/'w' count as one code,
    # regardless of what follows them.
    text = re.sub(r'([1-6])(?:[hw]+\1)+', r'\1', text)
    # Position 0 is represented by `first`; keep only digits after it.
    digits = re.sub('[^0-9]+', '', text[1:])
    return (first + digits + '000')[:4]
class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """Classifier-based POS tagger whose features also include a Soundex code."""

    def feature_detector(self, tokens, index, history):
        """Return the parent class's features for ``tokens[index]`` with a
        ``'soundex'`` entry added for that token."""
        parent = super(SoundexBasedPOSTagger, self)
        feats = parent.feature_detector(tokens, index, history)
        current_token = tokens[index]
        feats['soundex'] = soundex(current_token)
        return feats
def kfold(data, r, k=10):
    """Split *data* into the train and test parts of fold *r* out of *k*.

    The test part is the *r*-th of *k* contiguous, near-equal slices of
    *data*; the training part is everything before and after it, in the
    original order.

    Args:
        data: a sliceable sequence (anything supporting ``len`` and slicing).
        r: zero-based fold index, ``0 <= r < k``.
        k: number of folds.

    Returns:
        A ``(train, test)`` tuple of lists.

    Raises:
        ValueError: if *r* is not a valid fold index for *k*.
    """
    if not 0 <= r < k:
        raise ValueError("fold index %r is outside range(%r)" % (r, k))
    # Fractional fold size so the k folds cover data without a remainder.
    fold_size = len(data) / float(k)
    test_start = int(fold_size * r)
    test_end = int(fold_size * (r + 1))
    test = list(data[test_start:test_end])
    # Slice around the test window instead of testing every index against
    # range(), which rebuilds a list per element on Python 2.
    train = list(data[:test_start]) + list(data[test_end:])
    return train, test
def evaluate(sents, k=10):
    """Run k-fold evaluation of five taggers on *sents* and print a table.

    For every fold a unigram, bigram and trigram tagger (each backing off
    to the previous one, ending in a default ``'NN'`` tagger), a
    ``ClassifierBasedPOSTagger`` and a ``SoundexBasedPOSTagger`` are
    trained on the training part and scored on the test part.

    Args:
        sents: tagged sentences, split per fold by ``kfold``.
        k: number of folds (defaults to the original ten).

    Returns:
        A dict mapping ``'unigram'``, ``'bigram'``, ``'trigram'``,
        ``'classifier'`` and ``'soundex'`` to the list of per-fold scores.
    """
    columns = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')
    score = dict((name, []) for name in columns)

    # NOTE(review): recent NLTK releases appear to prefer `accuracy()` over
    # `evaluate()` on taggers -- confirm against the installed version.
    print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
    for i in range(k):
        train_sents, test_sents = kfold(sents, i, k)

        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        score['unigram'].append(t1.evaluate(test_sents))
        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        score['bigram'].append(t2.evaluate(test_sents))
        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        score['trigram'].append(t3.evaluate(test_sents))

        classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
        score['classifier'].append(classifier_tagger.evaluate(test_sents))
        soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
        score['soundex'].append(soundex_tagger.evaluate(test_sents))

        print("%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f" % (
            (i,) + tuple(score[name][i] for name in columns)))

    # Trailing "\n" keeps the blank line the original left after the table.
    print("Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n" % tuple(
        sum(score[name]) / float(k) for name in columns))
    return score
_COLUMNS = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')


def _mean(values):
    """Return the arithmetic mean of a non-empty list of scores."""
    return sum(values) / float(len(values))


def _print_fold_table(score):
    """Print the per-fold scores and their averages for one category."""
    print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
    for i in range(len(score['unigram'])):
        print("%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f" % (
            (i,) + tuple(score[name][i] for name in _COLUMNS)))
    print("Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n" % tuple(
        _mean(score[name]) for name in _COLUMNS))


# Per-category fold scores, keyed by Brown category name.
result = {}

# Set a category to True to leave it out of the run.
skip_categories = {
    'adventure': False, 'belles_lettres': False, 'editorial': False,
    'fiction': False, 'government': False, 'hobbies': False,
    'humor': False, 'learned': False, 'lore': False, 'mystery': False,
    'news': False, 'religion': False, 'reviews': False, 'romance': False,
    'science_fiction': False}

for c in brown.categories():
    print("category : %s" % c)
    # .get() so a category missing from the table is evaluated, not a KeyError.
    if skip_categories.get(c, False):
        print("*** skipped ***")
        continue

    obj_file = "result.brown.%s" % c
    if os.path.isfile(obj_file):
        # Cached scores from an earlier run.  SECURITY: pickle.load executes
        # arbitrary code from the file -- only load caches this script wrote.
        # NOTE: caches written in text mode on Windows by the old version of
        # this script may need to be regenerated.
        with open(obj_file, 'rb') as cache:
            result[c] = pickle.load(cache)
        _print_fold_table(result[c])
    else:
        tagged_corpus = brown.tagged_sents(categories=c)
        result[c] = evaluate(tagged_corpus)
        with open(obj_file, 'wb') as cache:
            pickle.dump(result[c], cache)

sum_score = {'unigram': [], 'bigram': [], 'trigram': [], 'classifier': [],
             'soundex': []}
print("Summary Score...")
print("Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
# Sorted so the summary rows come out in a stable order.
for c in sorted(result):
    means = dict((name, _mean(result[c][name])) for name in _COLUMNS)
    print("%s\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f" % (
        (c,) + tuple(means[name] * 100 for name in _COLUMNS)))
    for name in _COLUMNS:
        sum_score[name].append(means[name])

# Nothing to average when every category was skipped.
if result:
    print("Average Score (%%)\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f\n" % tuple(
        sum(sum_score[name]) / len(result) * 100 for name in _COLUMNS))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement