Advertisement
sujitpal

NgramModel Problem Demo Take 2

Apr 24th, 2013
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.67 KB | None | 0 0
  1. from __future__ import division
  2.  
  3. import os.path
  4.  
  5. import nltk
  6. from nltk.corpus.reader import XMLCorpusReader
  7. from nltk.model.ngram import NgramModel
  8. from nltk.probability import LidstoneProbDist
  9. import urllib2
  10.  
  11. def train():
  12.   if not os.path.isfile("medsamp2013.xml"):
  13.     # download input XML file
  14.     ifile = urllib2.urlopen("http://www.nlm.nih.gov/databases/dtd/medsamp2013.xml")
  15.     ofile = open("medsamp2013.xml", 'wb')
  16.     for line in ifile:
  17.       ofile.write(line)
  18.     ofile.close()
  19.   reader = XMLCorpusReader(".", "medsamp2013.xml")
  20.   # reconstitute list of words into list of sentences
  21.   sentences = []
  22.   sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
  23.   words = [] # list of list
  24.   for sentence in sentences:
  25.     words.append(nltk.word_tokenize(sentence))
  26.   # build trigram language model
  27.   est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
  28.   lm = NgramModel(3, words, estimator=est)
  29.   # Alternatively, we can not set the estimator...
  30. #  lm = NgramModel(3, words)
  31.   return lm
  32.  
  33. def test(lm):
  34.   testSentences = [
  35.     "Hearing loss is being partly or totally unable to hear sound in one or both ears.",
  36.     "In law, a hearing is a proceeding before a court or other decision-making body or officer, such as a government agency."
  37.   ]
  38.   for sentence in testSentences:
  39.     print "SENTENCE:", sentence,
  40.     words = nltk.word_tokenize(sentence)
  41.     slogprob = 0
  42.     trigrams = nltk.trigrams(words)
  43.     for trigram in trigrams:
  44.       word = trigram[2]
  45.       context = list(trigrams[:-1])
  46.       slogprob += lm.logprob(word, context)
  47.     print "(", slogprob, ")"
  48.  
  49. def main():
  50.   lm = train()
  51.   test(lm)
  52.  
  53. if __name__ == "__main__":
  54.   main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement