Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- import nltk
- import nltk.parse.pchart
def wrap(sentence):
    """Return *sentence* enclosed in a top-level ``(ROOT ...)`` bracket.

    The result is the literal text ``"(ROOT "``, followed by *sentence*
    unchanged, followed by ``")"``.
    """
    pieces = ("(ROOT ", sentence, ")")
    return "".join(pieces)
def unkize(words, rareWords):
    """Replace every rare word in *words* with the ``"<UNK>"`` placeholder.

    Args:
        words: iterable of tokens.
        rareWords: container of tokens to mask; only membership (``in``)
            is used, so a ``set`` gives the cheapest lookups.

    Returns:
        A new list with the same length and order as *words*, in which
        each token found in *rareWords* is replaced by ``"<UNK>"``.
    """
    # Build a real list rather than using map(): map() is lazy on Python 3,
    # and the result is consumed more than once by the caller (it is printed
    # and then handed to several parsers).
    return ["<UNK>" if word in rareWords else word for word in words]
# ---------------------------------------------------------------------------
# Induce a PCFG from the first few treebank sentences, masking rare words as
# "<UNK>", then parse those same sentences back with two NLTK parsers.
#
# NOTE(review): FreqDist.inc() and nltk.Tree(<bracketed string>) are the
# NLTK 2 spellings this script was written against -- confirm the installed
# NLTK version before porting.
# ---------------------------------------------------------------------------

TRAIN_PATH = "parse_train.dat"
MAX_SENTENCES = 10   # only the first lines of the treebank are used
RARE_THRESHOLD = 3   # words seen fewer times than this are masked as <UNK>


def _read_trees(path, limit, normalize):
    """Yield one tree per line for the first *limit* lines of *path*.

    Each line is stripped, wrapped in a ROOT node and handed to nltk.Tree.
    When *normalize* is true the tree is additionally passed through
    collapse_unary(collapsePOS=True) and chomsky_normal_form().
    """
    # 'with' guarantees the handle is closed even if a line fails to parse.
    with open(path, 'rb') as ftrain:
        for index, line in enumerate(ftrain):
            # Stop early instead of loading the whole file with readlines().
            if index >= limit:
                break
            tree = nltk.Tree(wrap(line.strip()))
            if normalize:
                tree.collapse_unary(collapsePOS=True)
                tree.chomsky_normal_form()
            yield tree


def _mask_rare_terminal(production, rare_words):
    """Return *production*, with a rare single terminal replaced by <UNK>."""
    if production.is_lexical() and len(production.rhs()) == 1:
        if production.rhs()[0] in rare_words:
            return nltk.Production(production.lhs(), ["<UNK>"])
    return production


# The word-count pass and the grammar pass need exactly the same normalized
# trees, so they are parsed once and shared.
train_trees = list(_read_trees(TRAIN_PATH, MAX_SENTENCES, normalize=True))

# accumulate rare words
wordsFD = nltk.FreqDist()
for tree in train_trees:
    for word in tree.leaves():
        wordsFD.inc(word)
rareWords = set(word for word in wordsFD if wordsFD[word] < RARE_THRESHOLD)

# build grammar
productions = []
for tree in train_trees:
    # Single-argument print(...) emits the same text under Python 2's print
    # statement as the original comma-separated form did.
    print("train sentence= %s" % (tree.leaves(),))
    print("orig tree= %s" % (tree,))
    for production in tree.productions():
        productions.append(_mask_rare_terminal(production, rareWords))

root = nltk.Nonterminal("ROOT")
grammar = nltk.induce_pcfg(root, productions)
print(grammar)

parsers = [
    nltk.ChartParser(grammar),
    nltk.ViterbiParser(grammar),
]

# Parse the training sentences back. The trees are read without
# normalization, as only their leaves are used here.
for tree in _read_trees(TRAIN_PATH, MAX_SENTENCES, normalize=False):
    # list(...) keeps the tokens reusable across both parsers even if
    # unkize() hands back a lazy iterable.
    words = list(unkize(tree.leaves(), rareWords))
    print("test sentence:  %s" % (words,))
    for parser in parsers:
        print("parsed tree [ %s ]: %s"
              % (parser.__class__.__name__, parser.parse(words)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement