Advertisement
sujitpal

Demo Question Parsing problem on nltk-user

Apr 13th, 2013
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.82 KB | None | 0 0
  1. from __future__ import division
  2. import nltk
  3. import nltk.parse.pchart
  4.  
  def wrap(sentence):
    """Return ``sentence`` enclosed in a top-level ``(ROOT ...)`` bracket.

    sentence -- string to wrap; callers in this script pass one stripped
                line of the training file.

    The result is the literal text "(ROOT " + sentence + ")"; the input is
    not validated or otherwise modified.
    """
    return "(ROOT " + sentence + ")"
  7.  
  def unkize(words, rareWords):
    """Return a new list in which every rare word is replaced by "<UNK>".

    words     -- iterable of word tokens, in sentence order.
    rareWords -- container supporting ``in``; tokens found in it are
                 replaced, all other tokens are kept unchanged.

    A list comprehension is used instead of ``map`` so that the result is a
    real list (indexable, reusable by several parsers) on Python 3 as well
    as Python 2. The input is not mutated.
    """
    return ["<UNK>" if word in rareWords else word for word in words]
  10.  
  # accumulate rare words
  # Count how often each leaf word occurs in the first 10 training trees;
  # any word counted fewer than 3 times ends up in rareWords.
  wordsFD = nltk.FreqDist()
  # "with" guarantees the file is closed even if a tree string fails to parse.
  with open("parse_train.dat", 'rb') as ftrain:
    for sentence in ftrain.readlines()[0:10]:
      sentence = wrap(sentence.strip())
      tree = nltk.Tree(sentence)
      # Same tree transformations as the grammar-building pass below.
      tree.collapse_unary(collapsePOS=True)
      tree.chomsky_normal_form()
      # Plain loop: inc() is called only for its side effect on wordsFD.
      for word in tree.leaves():
        wordsFD.inc(word)
  rareWords = set(word for word in wordsFD if wordsFD[word] < 3)
  22.  
  23. # build grammar
  24. productions = []
  25. ftrain = open("parse_train.dat", 'rb')
  26. for sentence in ftrain.readlines()[0:10]:
  27.   sentence = wrap(sentence.strip())
  28.   tree = nltk.Tree(sentence)
  29.   tree.collapse_unary(collapsePOS=True)
  30.   tree.chomsky_normal_form()
  31.   print "train sentence=", tree.leaves()
  32.   print "orig tree=", tree
  33.   for production in tree.productions():
  34.     if production.is_lexical() and len(production.rhs()) == 1:
  35.       termWord = production.rhs()[0]
  36.       if termWord in rareWords:
  37.         modProduction = nltk.Production(production.lhs(), ["<UNK>"])
  38.         productions.append(modProduction)
  39.         continue
  40.     productions.append(production)
  41. ftrain.close()
  42. root = nltk.Nonterminal("ROOT")
  43. grammar = nltk.induce_pcfg(root, productions)
  44. print grammar
  45.  
  46. parsers = [
  47.   nltk.ChartParser(grammar),
  48.   nltk.ViterbiParser(grammar)
  49. ]
  50. ftrain = open("parse_train.dat", 'rb')
  51. for sentence in ftrain.readlines()[0:10]:
  52.   sentence = wrap(sentence.strip())
  53.   tree = nltk.Tree(sentence)
  54.   words = unkize(tree.leaves(), rareWords)
  55.   print "test sentence: ", words
  56.   for parser in parsers:
  57.     print "parsed tree [ " + parser.__class__.__name__, \
  58.       "]:", parser.parse(words)
  59. ftrain.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement