Want more features on Pastebin? Sign Up, it's FREE!
Guest

Part-of-Speech tagging based on Soundex features.

By: a guest on Dec 18th, 2013  |  syntax: Python  |  size: 8.97 KB  |  views: 173  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
This paste has a previous version, view the difference. Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. Part-of-Speech tagging based on Soundex features. Take a look at python code below…
  2.  
  3. Source Code
  4. ===========
  5. import re
  6. import nltk
  7. from nltk.corpus import brown
  8. from nltk.tag.sequential import * def soundex(text):
  9.     first = text[0].upper()
  10.     text.lower()
  11.     text = re.sub('[bfpv]+','1',text)
  12.     text = re.sub('[cgjkqsxz]+','2',text)
  13.     text = re.sub('[dt]+','3',text)
  14.     text = re.sub('[l]+','4',text)
  15.     text = re.sub('[mn]+','5',text)
  16.     text = re.sub('[r]+','6',text)
  17.     for c in '123456':
  18.         text = re.sub(r'('+c+')([hw])('+c+')([1-6])',r'\1\2\4',text)
  19.     result = first + re.sub('[^0-9]+','',text[1:])
  20.     result = result[:4] + '0000'[:4-len(result[:4])]
  21.     return result class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
  22.     def feature_detector(self, tokens, index, history):
  23.         features = super(SoundexBasedPOSTagger, self).feature_detector(
  24.             tokens, index, history)
  25.         features['soundex'] = soundex(tokens[index])
  26.         return features brown_tagged_sents = brown.tagged_sents(categories='news')
  27. brown_sents = brown.sents(categories='news')
  28. size = int(len(brown_tagged_sents) * 0.9)
  29. train_sents = brown_tagged_sents[:size]
  30. test_sents = brown_tagged_sents[size:]
  31. soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
  32. print "Accuracy (with soundex):", soundex_tagger.evaluate(test_sents)
  33. tagger = ClassifierBasedPOSTagger(train=train_sents)
  34. print "Accuracy (without soundex):", tagger.evaluate(test_sents)
  35.  
  36. Result
  37. ======
  38. >>>
  39. ...
  40. Accuracy (with soundex): 0.889763779528
  41. Accuracy (without soundex): 0.887571015648  
  42.  
  43. As you can see in the evaluation above, accuracy is slightly better when the Soundex code is added to the classifier's feature set.
  44.  
  45. Reference
  46. =========
  47.  [1] http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html
  48.  [2] Russell, R.C. (1918). United States patent 1261167. Washington, United States Patent Office.
  49.  
  50. Credit
  51. ======
  52. Kanokwut Thanadkarn
  53.  
  54. Appendix
  55. ========
  56. Ten-fold cross-validation results (accuracy, %) for every Brown corpus category.
  57.  
  58.  ===========================================================
  59.  Category          Unigram Bigram  Trigram Classi. Soundex
  60.  mystery           85.0129 86.5653 86.4222 90.0662 90.3430
  61.  belles_lettres    86.2852 87.5612 87.5470 90.3835 90.5070
  62.  humor             80.1900 80.5058 80.4568 86.2681 86.6681
  63.  government        83.0616 85.0469 84.9953 86.8605 87.1182
  64.  fiction           84.9116 85.9910 85.9247 89.7749 90.1300
  65.  reviews           79.5728 80.3393 80.3386 86.2385 86.4775
  66.  religion          82.0312 82.9264 82.7888 86.5820 87.0057
  67.  romance           85.2516 86.6441 86.5653 90.0606 90.3025
  68.  science_fiction   81.3618 81.7111 81.5965 86.3424 86.9700
  69.  adventure         85.1166 86.4087 86.2206 90.2558 90.6666
  70.  editorial         82.6672 83.8054 83.6872 87.4071 87.7115
  71.  hobbies           82.2784 83.1417 83.0963 86.1888 86.4665
  72.  lore              84.4681 85.4592 85.4197 88.8270 89.0992
  73.  news              83.2413 84.2187 84.0862 88.2912 88.4107
  74.  learned           86.4339 87.5311 87.4997 88.9631 88.9764
  75.  Average Score (%) 83.4589 84.5237 84.4430 88.1673 88.4569
  76.  ===========================================================
  77.  
  78. [Source Code]
  79. import re
  80.  import nltk
  81.  import pickle
  82.  import os.path
  83.  from nltk.corpus import brown
  84.  from nltk.tag.sequential import *
  85.  
  86. def soundex(text):
  87.      if not text.isalpha():
  88.          return None
  89.      first = text[0].upper()
  90.      text.lower()
  91.      text = re.sub([bfpv]+’,1,text)
  92.      text = re.sub([cgjkqsxz]+’,2,text)
  93.      text = re.sub([dt]+’,3,text)
  94.      text = re.sub([l]+’,4,text)
  95.      text = re.sub([mn]+’,5,text)
  96.      text = re.sub([r]+’,6,text)
  97.      for c in123456′:
  98.          text = re.sub(r’(‘+c+’)([hw])(‘+c+’)([1-6]),r’\1\2\4,text)
  99.      result = first + re.sub([^0-9]+’,,text[1:])
  100.      result = result[:4] + ’0000[:4-len(result[:4])]
  101.      return result
  102.  
  103. class SoundexBasedPOSTagger(ClassifierBasedPOSTagger):
  104.      def feature_detector(self, tokens, index, history):
  105.          features = super(SoundexBasedPOSTagger, self).feature_detector(
  106.              tokens, index, history)
  107.          features['soundex'] = soundex(tokens[index])
  108.          return features
  109.  
  110. def kfold(data, r, k=10):
  111.      fsize = len(data) / float(k)
  112.      test_start = int(fsize * r)
  113.      test_end = int(fsize * (r+1))
  114.      test = [x for x in data[test_start:test_end]]
  115.      train = [x for i,x in enumerate(data) if
  116.                           i not in range(test_start, test_end)]
  117.      return train, test
  118.          
  119.  
  120. def evaluate(sents):
  121.      score = {‘unigram’:[], ’bigram’:[], ’trigram’:[], ’classifier’:[], ’soundex’:[]}
  122.      for i in range(10):
  123.          train_sents, test_sents = kfold(sents, i)
  124.          if i == 0:
  125.              print ”Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex”
  126.          t0 = nltk.DefaultTagger(‘NN’)
  127.          t1 = nltk.UnigramTagger(train_sents, backoff=t0)
  128.          score['unigram'].append(t1.evaluate(test_sents))
  129.  
  130.         t2 = nltk.BigramTagger(train_sents, backoff=t1)
  131.          score['bigram'].append(t2.evaluate(test_sents))
  132.  
  133.         t3 = nltk.TrigramTagger(train_sents, backoff=t2)
  134.          score['trigram'].append(t3.evaluate(test_sents))
  135.  
  136.         classifier_tagger = ClassifierBasedPOSTagger(train=train_sents)
  137.          score['classifier'].append(classifier_tagger.evaluate(test_sents))
  138.  
  139.         soundex_tagger = SoundexBasedPOSTagger(train=train_sents)
  140.          score['soundex'].append(soundex_tagger.evaluate(test_sents))
  141.          print ”%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f” % (i,
  142.                      score['unigram'][i],
  143.                      score['bigram'][i],
  144.                      score['trigram'][i],
  145.                      score['classifier'][i],
  146.                      score['soundex'][i])
  147.  
  148.     print ”Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n” % (
  149.                  sum(score['unigram']) / 10,
  150.                  sum(score['bigram']) / 10,
  151.                  sum(score['trigram']) / 10,
  152.                  sum(score['classifier']) / 10,
  153.                  sum(score['soundex']) / 10)
  154.      return score
  155.  
  156. result = {}
  157.  skip_categories = {
  158.      ’adventure’:False, ’belles_lettres’:False, ’editorial’:False,’fiction’:False,
  159.      ’government’:False, ’hobbies’:False, ’humor’:False, ’learned’:False,
  160.      ’lore’:False, ’mystery’:False, ’news’:False, ’religion’:False,
  161.      ’reviews’:False, ’romance’:False, ’science_fiction’:False}
  162.  for c in brown.categories():
  163.      print ”category :”, c
  164.      if skip_categories[c]:
  165.          print ”*** skipped ***”
  166.          continue
  167.      obj_file = ”result.brown.%s” % c
  168.      if os.path.isfile(obj_file):
  169.          result[c] = pickle.load(file(obj_file, ’r'))
  170.         for i in range(10):
  171.             if i == 0:
  172.                 print ”Fold #\tUnigram\tBigram\tTrigram\tClassi.\tSoundex”
  173.             print ”%d\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f” % (i,
  174.                     result[c]['unigram'][i],
  175.                     result[c]['bigram'][i],
  176.                     result[c]['trigram'][i],
  177.                     result[c]['classifier'][i],
  178.                     result[c]['soundex'][i])
  179.  
  180.        print ”Avg.\t%0.5f\t%0.5f\t%0.5f\t%0.5f\t%0.5f\n” % (
  181.                 sum(result[c]['unigram']) / 10,
  182.                 sum(result[c]['bigram']) / 10,
  183.                 sum(result[c]['trigram']) / 10,
  184.                 sum(result[c]['classifier']) / 10,
  185.                 sum(result[c]['soundex']) / 10)
  186.     else:
  187.         tagged_corpus = brown.tagged_sents(categories=c)
  188.         result[c] = evaluate(tagged_corpus)
  189.         pickle.dump(result[c], file(obj_file, ’w'))
  190.  sum_score = {‘unigram’:[], ’bigram’:[], ’trigram’:[], ’classifier’:[], ’soundex’:[]}
  191.  print ”Summary Score…”
  192.  print ”Category\t\t\tUnigram\tBigram\tTrigram\tClassi.\tSoundex”
  193.  for c in result.keys():
  194.      print ”%s\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f” % (c,
  195.                  sum(result[c]['unigram']) / 10 * 100,
  196.                  sum(result[c]['bigram']) / 10 * 100,
  197.                  sum(result[c]['trigram']) / 10 * 100,
  198.                  sum(result[c]['classifier']) / 10 * 100,
  199.                  sum(result[c]['soundex']) / 10 * 100)
  200.      sum_score['unigram'].append(sum(result[c]['unigram']) / 10)
  201.      sum_score['bigram'].append(sum(result[c]['bigram']) / 10)
  202.      sum_score['trigram'].append(sum(result[c]['trigram']) / 10)
  203.      sum_score['classifier'].append(sum(result[c]['classifier']) / 10)
  204.      sum_score['soundex'].append(sum(result[c]['soundex']) / 10)
  205.  
  206. print ”Average Score (%%)\t\t\t%2.4f\t%2.4f\t%2.4f\t%2.4f\t%2.4f\n” % (
  207.              sum(sum_score['unigram']) / len(result.keys()) * 100,
  208.              sum(sum_score['bigram']) / len(result.keys()) * 100,
  209.              sum(sum_score['trigram']) / len(result.keys()) * 100,
  210.              sum(sum_score['classifier']) / len(result.keys()) * 100,
  211.              sum(sum_score['soundex']) / len(result.keys()) * 100)
clone this paste RAW Paste Data