Advertisement
sujitpal

NLTK NgramLangModel usage problem

Apr 24th, 2013
147
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.95 KB | None | 0 0
  1. from __future__ import division
  2.  
  3. import glob
  4. import nltk
  5. from nltk.corpus.reader import XMLCorpusReader
  6. from nltk.model.ngram import NgramModel
  7. from nltk.probability import LidstoneProbDist
  8. import cPickle
  9.  
  10. def train():
  11.   # parse XML and load up words
  12.   print("Loading words from XML files...")
  13.   sentences = []
  14.   files = glob.glob("data/*.xml")
  15.   i = 0
  16.   for file in files:
  17.     if i > 0 and i % 500 == 0:
  18.       print("%d/%d files loaded, #-sentences: %d" %
  19.         (i, len(files), len(sentences)))
  20.       break
  21.     dir, file = file.split("/")
  22.     reader = XMLCorpusReader(dir, file)
  23.     sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
  24.     i += 1
  25.   words = []
  26.   for sentence in sentences:
  27.     words.append(nltk.word_tokenize(sentence))
  28.   # build a trigram Language Model (using default Good-Turing
  29.   # smoothing) with the words array
  30.   print("Building language model...")
  31.   # Initial usage: instantiate LangModel with LidstoneProbDist
  32.   est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
  33.   langModel = NgramModel(3, words, estimator=est)
  34.   # ...not working... fall back to using default GoodTuringProbDist estimator
  35. #  langModel = NgramModel(3, words)
  36.   # ...drat! still not working!
  37.   # Try to pickle the langModel since we don't want to have to train it each time...
  38. #  cPickle.dump(langModel, open("lm.bin", 'wb'))
  39.   # ...not working, regardless of the estimator chosen
  40.  
  41.   return langModel
  42.  
  43. def test(langModel):
  44.   testData = open("sentences.test", 'rb')
  45.   for line in testData:
  46.     sentence = line.strip()
  47.     print "SENTENCE:", sentence,
  48.     words = nltk.word_tokenize(sentence)
  49.     trigrams = nltk.trigrams(words)
  50.     slogprob = 0
  51.     for trigram in trigrams:
  52.       word = trigram[2]
  53.       context = list(trigrams[:-1])
  54.       slogprob += langModel.logprob(word, context)
  55.     print "(", slogprob, ")"
  56.   testData.close()
  57.  
  58. def main():
  59.   langModel = train()
  60.   test(langModel)
  61.  
  62. if __name__ == "__main__":
  63.   main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement