View difference between Paste ID: UaT0AneH and 0PmPFmpT
SHOW: | | - or go back to the newest paste.
1
Part-of-Speech tagging based on Soundex features. Take a look at the Python code below.
2
3
Source Code
4
===========
5
import re
import nltk
from nltk.corpus import brown
from nltk.tag.sequential import *


def soundex(text):
    """Return a four-character Soundex-style code for ``text``.

    The code is the upper-cased first character of ``text`` followed by up
    to three digits derived from the remaining letters, right-padded with
    '0' to a total length of four.

    Args:
        text: The token to encode.

    Returns:
        The four-character code, or None when ``text`` is empty.
    """
    if not text:
        # An empty token has no first character to anchor the code on.
        return None
    first = text[0].upper()
    # str.lower() returns a new string; the result has to be kept, otherwise
    # upper-case letters are never matched by the classes below.
    text = text.lower()
    # Each run of letters from one class collapses to a single digit.
    text = re.sub('[bfpv]+', '1', text)
    text = re.sub('[cgjkqsxz]+', '2', text)
    text = re.sub('[dt]+', '3', text)
    text = re.sub('[l]+', '4', text)
    text = re.sub('[mn]+', '5', text)
    text = re.sub('[r]+', '6', text)
    # Equal digits separated only by 'h'/'w' count once, whether or not
    # another digit follows.
    text = re.sub(r'([1-6])(?:[hw]+\1)+', r'\1', text)
    # Position 0 is represented by ``first``; keep only digits after it.
    result = first + re.sub('[^0-9]+', '', text[1:])
    result = result[:4] + '0000'[:4 - len(result[:4])]
    return result


class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger whose features also include a Soundex code."""

    def feature_detector(self, tokens, index, history):
        """Return the parent's features plus ``'soundex'`` for the token.

        The extra entry holds ``soundex(tokens[index])``.
        """
        features = super(SoundexBasedPOSTagger, self).feature_detector(
            tokens, index, history)
        features['soundex'] = soundex(tokens[index])
        return features


# Train on the first 90% of the Brown "news" sentences, test on the rest.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

# Single-argument print(...) produces the same output on Python 2 and 3.
soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
print("Accuracy (with soundex): %s" % soundex_tagger.evaluate(test_sents))
tagger = ClassifierBasedPOSTagger(train=train_sents)
print("Accuracy (without soundex): %s" % tagger.evaluate(test_sents))
35
36
Result
37
======
38
>>>
39
...
40
Accuracy (with soundex): 0.889763779528
41
Accuracy (without soundex): 0.887571015648  
42-
Reference:
42+
43
As you can see in the evaluation above, accuracy is slightly better when Soundex is added to the classifier's feature set.
44
45
Reference
46-
Appendix:
46+
=========
47
 [1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html
48
 [2] Russell, R.C. (1918). United States patent 1,261,167. Washington, United States Patent Office.
49
50
Credit
51
======
52
Kanokwut Thanadkarn
53
54
Appendix
55
========
56
Ten-fold evaluation results on all Brown corpus categories.
57
58
 ===========================================================
59
 Category          Unigram Bigram  Trigram Classi. Soundex
60
 mystery           85.0129 86.5653 86.4222 90.0662 90.3430
61
 belles_lettres    86.2852 87.5612 87.5470 90.3835 90.5070
62
 humor             80.1900 80.5058 80.4568 86.2681 86.6681
63
 government        83.0616 85.0469 84.9953 86.8605 87.1182
64
 fiction           84.9116 85.9910 85.9247 89.7749 90.1300
65
 reviews           79.5728 80.3393 80.3386 86.2385 86.4775
66
 religion          82.0312 82.9264 82.7888 86.5820 87.0057
67
 romance           85.2516 86.6441 86.5653 90.0606 90.3025
68
 science_fiction   81.3618 81.7111 81.5965 86.3424 86.9700
69
 adventure         85.1166 86.4087 86.2206 90.2558 90.6666
70
 editorial         82.6672 83.8054 83.6872 87.4071 87.7115
71
 hobbies           82.2784 83.1417 83.0963 86.1888 86.4665
72
 lore              84.4681 85.4592 85.4197 88.8270 89.0992
73
 news              83.2413 84.2187 84.0862 88.2912 88.4107
74
 learned           86.4339 87.5311 87.4997 88.9631 88.9764
75
 Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569
76
 ===========================================================
77
78
[Source Code]
79
import re
80
 import nltk
81
 import pickle
82
 import os.path
83
 from nltk.corpus import brown
84
 from nltk.tag.sequential import *
85
 
86
def soundex(text):
    """Return a four-character Soundex-style code for an alphabetic token.

    The code is the upper-cased first letter of ``text`` followed by up to
    three digits derived from the remaining letters, right-padded with '0'
    to a total length of four.

    Args:
        text: The token to encode.

    Returns:
        The four-character code, or None when ``text`` is empty or contains
        any non-alphabetic character.
    """
    if not text.isalpha():
        return None
    first = text[0].upper()
    # str.lower() returns a new string; the result has to be kept, otherwise
    # upper-case letters are never matched by the classes below.
    text = text.lower()
    # Each run of letters from one class collapses to a single digit.
    text = re.sub('[bfpv]+', '1', text)
    text = re.sub('[cgjkqsxz]+', '2', text)
    text = re.sub('[dt]+', '3', text)
    text = re.sub('[l]+', '4', text)
    text = re.sub('[mn]+', '5', text)
    text = re.sub('[r]+', '6', text)
    # Equal digits separated only by 'h'/'w' count once, whether or not
    # another digit follows.
    text = re.sub(r'([1-6])(?:[hw]+\1)+', r'\1', text)
    # Position 0 is represented by ``first``; keep only digits after it.
    result = first + re.sub('[^0-9]+', '', text[1:])
    result = result[:4] + '0000'[:4 - len(result[:4])]
    return result
102
 
103
class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger whose features also include a Soundex code."""

    def feature_detector(self, tokens, index, history):
        """Return the parent's features plus ``'soundex'`` for the token.

        The extra entry holds ``soundex(tokens[index])``.
        """
        parent = super(SoundexBasedPOSTagger, self)
        features = parent.feature_detector(tokens, index, history)
        current_token = tokens[index]
        features['soundex'] = soundex(current_token)
        return features
109
 
110
def kfold(data, r, k=10):
    """Split ``data`` into the train/test parts of fold ``r`` out of ``k``.

    The test part is the contiguous slice
    ``data[int(len(data) / k * r):int(len(data) / k * (r + 1))]``; the
    training part is every other element, in original order.

    Args:
        data: Sequence to split; must support ``len``, slicing and iteration.
        r: Zero-based index of the fold used as the test part.
        k: Total number of folds.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    fsize = len(data) / float(k)
    test_start = int(fsize * r)
    test_end = int(fsize * (r + 1))
    test = list(data[test_start:test_end])
    # Compare bounds directly: ``i not in range(...)`` rebuilds and scans a
    # list for every element on Python 2.
    train = [x for i, x in enumerate(data)
             if not (test_start <= i < test_end)]
    return train, test
118
         
119
 
120
def evaluate(sents):
    """Ten-fold cross-validate five taggers on ``sents`` and print the scores.

    For every fold returned by ``kfold`` this trains a backoff chain
    (default 'NN' -> unigram -> bigram -> trigram), a
    ``ClassifierBasedPOSTagger`` and a ``SoundexBasedPOSTagger`` on the
    training part and scores each with its ``evaluate`` method on the test
    part. One tab-separated row is printed per fold, then the averages.

    Args:
        sents: Tagged sentences, passed to ``kfold`` and on to the taggers.

    Returns:
        A dict mapping 'unigram', 'bigram', 'trigram', 'classifier' and
        'soundex' to a list holding one score per fold.
    """
    folds = 10
    columns = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')
    score = {'unigram': [], 'bigram': [], 'trigram': [],
             'classifier': [], 'soundex': []}

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
    for i in range(folds):
        train_sents, test_sents = kfold(sents, i, folds)

        # Each n-gram tagger backs off to the next simpler one.
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        score['unigram'].append(t1.evaluate(test_sents))

        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        score['bigram'].append(t2.evaluate(test_sents))

        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        score['trigram'].append(t3.evaluate(test_sents))

        classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
        score['classifier'].append(classifier_tagger.evaluate(test_sents))

        soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
        score['soundex'].append(soundex_tagger.evaluate(test_sents))

        print("%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f" % (
            (i,) + tuple(score[name][i] for name in columns)))

    print("Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n" % tuple(
        sum(score[name]) / folds for name in columns))
    return score
155
 
156
# Per-category fold scores, keyed by Brown category name.
result = {}
# Set a category to True to leave it out of the run.
skip_categories = {
    'adventure': False, 'belles_lettres': False, 'editorial': False,
    'fiction': False, 'government': False, 'hobbies': False,
    'humor': False, 'learned': False, 'lore': False, 'mystery': False,
    'news': False, 'religion': False, 'reviews': False, 'romance': False,
    'science_fiction': False}
_COLUMNS = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')

# Single-argument print(...) produces the same output on Python 2 and 3.
for c in brown.categories():
    print("category : %s" % c)
    # Categories missing from the table are evaluated rather than raising.
    if skip_categories.get(c, False):
        print("*** skipped ***")
        continue
    obj_file = "result.brown.%s" % c
    if os.path.isfile(obj_file):
        # Reuse cached scores. NOTE(review): pickle.load executes whatever
        # the file encodes; only load caches written by this script.
        with open(obj_file, 'rb') as cache:
            result[c] = pickle.load(cache)
        print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
        for i in range(10):
            print("%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f" % (
                (i,) + tuple(result[c][name][i] for name in _COLUMNS)))

        print("Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n" % tuple(
            sum(result[c][name]) / 10 for name in _COLUMNS))
    else:
        tagged_corpus = brown.tagged_sents(categories=c)
        result[c] = evaluate(tagged_corpus)
        # Pickle streams are bytes: write in binary mode and close the file.
        with open(obj_file, 'wb') as cache:
            pickle.dump(result[c], cache)

sum_score = {'unigram': [], 'bigram': [], 'trigram': [],
             'classifier': [], 'soundex': []}
print("Summary Score...")
print("Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
for c in result:
    means = [sum(result[c][name]) / 10 for name in _COLUMNS]
    print("%s\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f" % (
        (c,) + tuple(mean * 100 for mean in means)))
    for name, mean in zip(_COLUMNS, means):
        sum_score[name].append(mean)

# With every category skipped there is nothing to average.
if result:
    print("Average Score (%%)\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f\n" % tuple(
        sum(sum_score[name]) / len(result) * 100 for name in _COLUMNS))