Advertisement
Guest User

Part-of-Speech tagging based on Soundex features

a guest
Dec 18th, 2013
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.90 KB | None | 0 0
  1. Part-of-Speech tagging based on Soundex features. Take a look at the Python code below…
  2.  
  3. [Source Code]
  4. import re
  5. import nltk
  6. from nltk.corpus import brown
  7. from nltk.tag.sequential import * def soundex(text):
  8.     first = text[0].upper()
  9.     text.lower()
  10.     text = re.sub('[bfpv]+','1',text)
  11.     text = re.sub('[cgjkqsxz]+','2',text)
  12.     text = re.sub('[dt]+','3',text)
  13.     text = re.sub('[l]+','4',text)
  14.     text = re.sub('[mn]+','5',text)
  15.     text = re.sub('[r]+','6',text)
  16.     for c in '123456':
  17.         text = re.sub(r'('+c+')([hw])('+c+')([1-6])',r'\1\2\4',text)
  18.     result = first + re.sub('[^0-9]+','',text[1:])
  19.     result = result[:4] + '0000'[:4-len(result[:4])]
  20.     return result class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
  21.     def feature_detector(self, tokens, index, history):
  22.         features = super(SoundexBasedPOSTagger, self).feature_detector(
  23.             tokens, index, history)
  24.         features['soundex'] = soundex(tokens[index])
  25.         return features brown_tagged_sents = brown.tagged_sents(categories='news')
  26. brown_sents = brown.sents(categories='news')
  27. size = int(len(brown_tagged_sents) * 0.9)
  28. train_sents = brown_tagged_sents[:size]
  29. test_sents = brown_tagged_sents[size:]
  30. soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
  31. print "Accuracy (with soundex):", soundex_tagger.evaluate(test_sents)
  32. tagger = ClassifierBasedPOSTagger(train=train_sents)
  33. print "Accuracy (without soundex):", tagger.evaluate(test_sents)
  34.  
  35. >>>
  36. ...
  37. Accuracy (with soundex): 0.889763779528
  38. Accuracy (without soundex): 0.887571015648  
  39.  
  40. As the evaluation above shows, accuracy is slightly better when the Soundex code is added to the classifier's feature set.
  41.  
  42. Reference:
  43.  [1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html
  44.  [2] Russell, R.C. (1918). United States patent 1261167. Washington, United States Patent Office.
  45.  
  46. Appendix:
  47. Ten-fold cross-validation results on all Brown corpus categories.
  48.  
  49.  ===========================================================
  50.  Category          Unigram Bigram  Trigram Classi. Soundex
  51.  mystery           85.0129 86.5653 86.4222 90.0662 90.3430
  52.  belles_lettres    86.2852 87.5612 87.5470 90.3835 90.5070
  53.  humor             80.1900 80.5058 80.4568 86.2681 86.6681
  54.  government        83.0616 85.0469 84.9953 86.8605 87.1182
  55.  fiction           84.9116 85.9910 85.9247 89.7749 90.1300
  56.  reviews           79.5728 80.3393 80.3386 86.2385 86.4775
  57.  religion          82.0312 82.9264 82.7888 86.5820 87.0057
  58.  romance           85.2516 86.6441 86.5653 90.0606 90.3025
  59.  science_fiction   81.3618 81.7111 81.5965 86.3424 86.9700
  60.  adventure         85.1166 86.4087 86.2206 90.2558 90.6666
  61.  editorial         82.6672 83.8054 83.6872 87.4071 87.7115
  62.  hobbies           82.2784 83.1417 83.0963 86.1888 86.4665
  63.  lore              84.4681 85.4592 85.4197 88.8270 89.0992
  64.  news              83.2413 84.2187 84.0862 88.2912 88.4107
  65.  learned           86.4339 87.5311 87.4997 88.9631 88.9764
  66.  Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569
  67.  ===========================================================
  68.  
  69. [Source Code]
  70. import re
  71.  import nltk
  72.  import pickle
  73.  import os.path
  74.  from nltk.corpus import brown
  75.  from nltk.tag.sequential import *
  76.  
  77. def soundex(text):
  78.      if not text.isalpha():
  79.          return None
  80.      first = text[0].upper()
  81.      text.lower()
  82.      text = re.sub([bfpv]+’,1,text)
  83.      text = re.sub([cgjkqsxz]+’,2,text)
  84.      text = re.sub([dt]+’,3,text)
  85.      text = re.sub([l]+’,4,text)
  86.      text = re.sub([mn]+’,5,text)
  87.      text = re.sub([r]+’,6,text)
  88.      for c in123456′:
  89.          text = re.sub(r’(‘+c+’)([hw])(‘+c+’)([1-6]),r’\1\2\4,text)
  90.      result = first + re.sub([^0-9]+’,,text[1:])
  91.      result = result[:4] + ’0000[:4-len(result[:4])]
  92.      return result
  93.  
  94. class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
  95.      def feature_detector(self, tokens, index, history):
  96.          features = super(SoundexBasedPOSTagger, self).feature_detector(
  97.              tokens, index, history)
  98.          features['soundex'] = soundex(tokens[index])
  99.          return features
  100.  
  101. def kfold(data, r, k=10):
  102.      fsize = len(data) / float(k)
  103.      test_start = int(fsize * r)
  104.      test_end = int(fsize * (r+1))
  105.      test = [x for x in data[test_start:test_end]]
  106.      train = [x for i,x in enumerate(data) if
  107.                           i not in range(test_start, test_end)]
  108.      return train, test
  109.          
  110.  
  111. def evaluate(sents):
  112.      score = {‘unigram’:[], ’bigram’:[], ’trigram’:[], ’classifier’:[], ’soundex’:[]}
  113.      for i in range(10):
  114.          train_sents, test_sents = kfold(sents, i)
  115.          if i == 0:
  116.              print ”Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex”
  117.          t0 = nltk.DefaultTagger(‘NN’)
  118.          t1 = nltk.UnigramTagger(train_sents, backoff=t0)
  119.          score['unigram'].append(t1.evaluate(test_sents))
  120.  
  121.         t2 = nltk.BigramTagger(train_sents, backoff=t1)
  122.          score['bigram'].append(t2.evaluate(test_sents))
  123.  
  124.         t3 = nltk.TrigramTagger(train_sents, backoff=t2)
  125.          score['trigram'].append(t3.evaluate(test_sents))
  126.  
  127.         classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
  128.          score['classifier'].append(classifier_tagger.evaluate(test_sents))
  129.  
  130.         soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
  131.          score['soundex'].append(soundex_tagger.evaluate(test_sents))
  132.          print ”%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f” % (i,
  133.                      score['unigram'][i],
  134.                      score['bigram'][i],
  135.                      score['trigram'][i],
  136.                      score['classifier'][i],
  137.                      score['soundex'][i])
  138.  
  139.     print ”Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n” % (
  140.                  sum(score['unigram']) / 10,
  141.                  sum(score['bigram']) / 10,
  142.                  sum(score['trigram']) / 10,
  143.                  sum(score['classifier']) / 10,
  144.                  sum(score['soundex']) / 10)
  145.      return score
  146.  
  147. result = {}
  148.  skip_categories = {
  149.      ’adventure’:False, ’belles_lettres’:False, ’editorial’:False,’fiction’:False,
  150.      ’government’:False, ’hobbies’:False, ’humor’:False, ’learned’:False,
  151.      ’lore’:False, ’mystery’:False, ’news’:False, ’religion’:False,
  152.      ’reviews’:False, ’romance’:False, ’science_fiction’:False}
  153.  for c in brown.categories():
  154.      print ”category :”, c
  155.      if skip_categories[c]:
  156.          print ”*** skipped ***”
  157.          continue
  158.      obj_file = ”result.brown.%s” % c
  159.      if os.path.isfile(obj_file):
  160.          result[c] = pickle.load(file(obj_file, ’r'))
  161.         for i in range(10):
  162.             if i == 0:
  163.                 print ”Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex”
  164.             print ”%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f” % (i,
  165.                     result[c]['unigram'][i],
  166.                     result[c]['bigram'][i],
  167.                     result[c]['trigram'][i],
  168.                     result[c]['classifier'][i],
  169.                     result[c]['soundex'][i])
  170.  
  171.        print ”Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n” % (
  172.                 sum(result[c]['unigram']) / 10,
  173.                 sum(result[c]['bigram']) / 10,
  174.                 sum(result[c]['trigram']) / 10,
  175.                 sum(result[c]['classifier']) / 10,
  176.                 sum(result[c]['soundex']) / 10)
  177.     else:
  178.         tagged_corpus = brown.tagged_sents(categories=c)
  179.         result[c] = evaluate(tagged_corpus)
  180.         pickle.dump(result[c], file(obj_file, ’w'))
  181.  sum_score = {‘unigram’:[], ’bigram’:[], ’trigram’:[], ’classifier’:[], ’soundex’:[]}
  182.  print ”Summary Score…”
  183.  print ”Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex”
  184.  for c in result.keys():
  185.      print ”%s\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f” % (c,
  186.                  sum(result[c]['unigram']) / 10 * 100,
  187.                  sum(result[c]['bigram']) / 10 * 100,
  188.                  sum(result[c]['trigram']) / 10 * 100,
  189.                  sum(result[c]['classifier']) / 10 * 100,
  190.                  sum(result[c]['soundex']) / 10 * 100)
  191.      sum_score['unigram'].append(sum(result[c]['unigram']) / 10)
  192.      sum_score['bigram'].append(sum(result[c]['bigram']) / 10)
  193.      sum_score['trigram'].append(sum(result[c]['trigram']) / 10)
  194.      sum_score['classifier'].append(sum(result[c]['classifier']) / 10)
  195.      sum_score['soundex'].append(sum(result[c]['soundex']) / 10)
  196.  
  197. print ”Average Score (%%)\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f\n” % (
  198.              sum(sum_score['unigram']) / len(result.keys()) * 100,
  199.              sum(sum_score['bigram']) / len(result.keys()) * 100,
  200.              sum(sum_score['trigram']) / len(result.keys()) * 100,
  201.              sum(sum_score['classifier']) / len(result.keys()) * 100,
  202.              sum(sum_score['soundex']) / len(result.keys()) * 100)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement