SHOW:
|
|
- or go back to the newest paste.
1 | Part-of-speech tagging based on Soundex features. Take a look at the Python code below… | |
2 | ||
3 | Source Code | |
4 | =========== | |
5 | import re | |
6 | import nltk | |
7 | from nltk.corpus import brown | |
from nltk.tag.sequential import *


def soundex(text):
    """Return a four-character Soundex-style code for ``text``.

    The code is the upper-cased first character followed by up to three
    digits, right-padded with '0'.  ``text`` must be non-empty; no check is
    made that it is alphabetic.
    """
    first = text[0].upper()
    # str.lower() returns a new string; the result has to be kept, otherwise
    # capital letters are never matched by the lower-case classes below.
    text = text.lower()
    # Each run of letters from one consonant class collapses to one digit.
    for letters, digit in (('[bfpv]+', '1'),
                           ('[cgjkqsxz]+', '2'),
                           ('[dt]+', '3'),
                           ('[l]+', '4'),
                           ('[mn]+', '5'),
                           ('[r]+', '6')):
        text = re.sub(letters, digit, text)
    # Two equal digits separated by 'h' or 'w' count once.
    # NOTE(review): the pattern only fires when another digit follows the
    # pair; kept as in the original listing.
    for c in '123456':
        text = re.sub(r'(' + c + ')([hw])(' + c + ')([1-6])', r'\1\2\4', text)
    # The first character is represented by its letter, so whatever now sits
    # at position 0 is dropped before the remaining digits are collected.
    result = first + re.sub('[^0-9]+', '', text[1:])
    return result[:4].ljust(4, '0')


class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger whose features also include a Soundex code."""

    def feature_detector(self, tokens, index, history):
        """Return the parent's features plus ``'soundex'`` for the token."""
        features = super(SoundexBasedPOSTagger, self).feature_detector(
            tokens, index, history)
        features['soundex'] = soundex(tokens[index])
        return features


# Train on the first 90% of the Brown 'news' sentences, test on the rest.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
print("Accuracy (with soundex): %s" % soundex_tagger.evaluate(test_sents))
tagger = ClassifierBasedPOSTagger(train=train_sents)
print("Accuracy (without soundex): %s" % tagger.evaluate(test_sents))
35 | ||
36 | Result | |
37 | ====== | |
38 | >>> | |
39 | ... | |
40 | Accuracy (with soundex): 0.889763779528 | |
41 | Accuracy (without soundex): 0.887571015648 | |
42 | - | Reference: |
42 | + | |
43 | As you can see in the evaluation above, the tagger is slightly more accurate when the Soundex code is added to the classifier's feature set. | |
44 | ||
45 | Reference | |
46 | - | Appendix: |
46 | + | ========= |
47 | [1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html | |
48 | [2] Russell, R.C. (1918). United States patent 1261167. Washington, United States Patent Office. | |
49 | ||
50 | Credit | |
51 | ====== | |
52 | Kanokwut Thanadkarn | |
53 | ||
54 | Appendix | |
55 | ======== | |
56 | Ten-fold cross-validation results on all Brown corpus categories. | |
57 | ||
58 | =========================================================== | |
59 | Category Unigram Bigram Trigram Classi. Soundex | |
60 | mystery 85.0129 86.5653 86.4222 90.0662 90.3430 | |
61 | belles_lettres 86.2852 87.5612 87.5470 90.3835 90.5070 | |
62 | humor 80.1900 80.5058 80.4568 86.2681 86.6681 | |
63 | government 83.0616 85.0469 84.9953 86.8605 87.1182 | |
64 | fiction 84.9116 85.9910 85.9247 89.7749 90.1300 | |
65 | reviews 79.5728 80.3393 80.3386 86.2385 86.4775 | |
66 | religion 82.0312 82.9264 82.7888 86.5820 87.0057 | |
67 | romance 85.2516 86.6441 86.5653 90.0606 90.3025 | |
68 | science_fiction 81.3618 81.7111 81.5965 86.3424 86.9700 | |
69 | adventure 85.1166 86.4087 86.2206 90.2558 90.6666 | |
70 | editorial 82.6672 83.8054 83.6872 87.4071 87.7115 | |
71 | hobbies 82.2784 83.1417 83.0963 86.1888 86.4665 | |
72 | lore 84.4681 85.4592 85.4197 88.8270 89.0992 | |
73 | news 83.2413 84.2187 84.0862 88.2912 88.4107 | |
74 | learned 86.4339 87.5311 87.4997 88.9631 88.9764 | |
75 | Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569 | |
76 | =========================================================== | |
77 | ||
78 | [Source Code] | |
79 | import re | |
80 | import nltk | |
81 | import pickle | |
82 | import os.path | |
83 | from nltk.corpus import brown | |
84 | from nltk.tag.sequential import * | |
85 | ||
def soundex(text):
    """Return a four-character Soundex-style code for ``text``.

    The code is the upper-cased first letter followed by up to three digits,
    right-padded with '0' (for example ``'Robert'`` -> ``'R163'``).

    Returns None when ``text`` is empty or contains any non-alphabetic
    character.
    """
    if not text.isalpha():
        return None
    first = text[0].upper()
    # str.lower() returns a new string; the result has to be kept, otherwise
    # capital letters are never matched by the lower-case classes below.
    text = text.lower()
    # Each run of letters from one consonant class collapses to one digit.
    for letters, digit in (('[bfpv]+', '1'),
                           ('[cgjkqsxz]+', '2'),
                           ('[dt]+', '3'),
                           ('[l]+', '4'),
                           ('[mn]+', '5'),
                           ('[r]+', '6')):
        text = re.sub(letters, digit, text)
    # Two equal digits separated by 'h' or 'w' count once.
    # NOTE(review): the pattern only fires when another digit follows the
    # pair; kept as in the original listing.
    for c in '123456':
        text = re.sub(r'(' + c + ')([hw])(' + c + ')([1-6])', r'\1\2\4', text)
    # The first letter is represented by itself, so whatever now sits at
    # position 0 is dropped before the remaining digits are collected.
    result = first + re.sub('[^0-9]+', '', text[1:])
    return result[:4].ljust(4, '0')
102 | ||
class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
    """ClassifierBasedPOSTagger variant that adds a ``'soundex'`` feature."""

    def feature_detector(self, tokens, index, history):
        """Return the parent's feature mapping extended with a Soundex code.

        The mapping produced by the parent ``feature_detector`` gains one
        extra entry, ``'soundex'``, holding ``soundex(tokens[index])``.
        """
        parent = super(SoundexBasedPOSTagger, self)
        feature_map = parent.feature_detector(tokens, index, history)
        current_token = tokens[index]
        feature_map['soundex'] = soundex(current_token)
        return feature_map
109 | ||
def kfold(data, r, k=10):
    """Split ``data`` into train and test lists for fold ``r`` of ``k``.

    The test fold is the slice ``data[int(n/k * r):int(n/k * (r + 1))]``
    with ``n = len(data)``; every other element goes to the training list.

    Args:
        data: sequence supporting ``len()``, slicing and iteration.
        r: zero-based index of the fold to hold out.
        k: number of folds (default 10).

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    fsize = len(data) / float(k)
    test_start = int(fsize * r)
    test_end = int(fsize * (r + 1))
    test = list(data[test_start:test_end])
    # A chained comparison keeps this one pass over the data; testing
    # membership in range() rebuilt and scanned a list per element on
    # Python 2.
    train = [x for i, x in enumerate(data)
             if not test_start <= i < test_end]
    return train, test
118 | ||
119 | ||
def evaluate(sents, k=10):
    """Score five taggers on ``sents`` with k-fold splits, printing a table.

    For each fold, a unigram, bigram and trigram tagger (each backing off to
    the previous one, ending in a default 'NN' tagger), a
    ClassifierBasedPOSTagger and a SoundexBasedPOSTagger are trained on the
    training part, and the value of each tagger's ``evaluate(test_sents)``
    is recorded.  One row is printed per fold, followed by the column
    averages.

    Args:
        sents: tagged sentences, split with ``kfold``.
        k: number of folds; must be positive (default 10).

    Returns:
        A dict mapping 'unigram', 'bigram', 'trigram', 'classifier' and
        'soundex' to the list of per-fold scores.
    """
    names = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')
    score = dict((name, []) for name in names)
    print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
    for i in range(k):
        train_sents, test_sents = kfold(sents, i, k)

        # N-gram chain: each tagger backs off to the next simpler one.
        t0 = nltk.DefaultTagger('NN')
        t1 = nltk.UnigramTagger(train_sents, backoff=t0)
        score['unigram'].append(t1.evaluate(test_sents))

        t2 = nltk.BigramTagger(train_sents, backoff=t1)
        score['bigram'].append(t2.evaluate(test_sents))

        t3 = nltk.TrigramTagger(train_sents, backoff=t2)
        score['trigram'].append(t3.evaluate(test_sents))

        classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
        score['classifier'].append(classifier_tagger.evaluate(test_sents))

        soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
        score['soundex'].append(soundex_tagger.evaluate(test_sents))

        print("%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f" % (
            (i,) + tuple(score[name][i] for name in names)))

    print("Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n" % tuple(
        sum(score[name]) / k for name in names))
    return score
155 | ||
# Driver: evaluate every Brown category (or reload its cached scores from
# "result.brown.<category>"), then print a per-category summary table.
_SCORE_NAMES = ('unigram', 'bigram', 'trigram', 'classifier', 'soundex')


def _mean(values):
    """Return the arithmetic mean of ``values`` (0.0 for an empty list)."""
    return sum(values) / float(len(values)) if values else 0.0


result = {}
# Set a category to True to leave it out of the run.
skip_categories = {
    'adventure': False, 'belles_lettres': False, 'editorial': False,
    'fiction': False, 'government': False, 'hobbies': False,
    'humor': False, 'learned': False, 'lore': False, 'mystery': False,
    'news': False, 'religion': False, 'reviews': False, 'romance': False,
    'science_fiction': False}
for c in brown.categories():
    print("category : %s" % c)
    # .get() so a category missing from the table is processed rather than
    # raising KeyError.
    if skip_categories.get(c, False):
        print("*** skipped ***")
        continue
    obj_file = "result.brown.%s" % c
    if os.path.isfile(obj_file):
        # NOTE: pickle.load can execute arbitrary code; only load cache
        # files that this script wrote itself.
        with open(obj_file, 'rb') as cache:
            result[c] = pickle.load(cache)
        print("Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
        for i in range(len(result[c]['unigram'])):
            print("%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f" % (
                (i,) + tuple(result[c][name][i] for name in _SCORE_NAMES)))
        print("Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n" % tuple(
            _mean(result[c][name]) for name in _SCORE_NAMES))
    else:
        tagged_corpus = brown.tagged_sents(categories=c)
        result[c] = evaluate(tagged_corpus)
        # Binary mode plus a context manager: the handle is always closed.
        with open(obj_file, 'wb') as cache:
            pickle.dump(result[c], cache)

sum_score = dict((name, []) for name in _SCORE_NAMES)
print("Summary Score...")
print("Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex")
for c in result:
    means = [_mean(result[c][name]) for name in _SCORE_NAMES]
    print("%s\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f" % (
        (c,) + tuple(m * 100 for m in means)))
    for name, m in zip(_SCORE_NAMES, means):
        sum_score[name].append(m)

# With every category skipped there is nothing to average.
if result:
    print("Average Score (%%)\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f\n" % tuple(
        _mean(sum_score[name]) * 100 for name in _SCORE_NAMES))