Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from __future__ import division
- import os.path
- import nltk
- from nltk.corpus.reader import XMLCorpusReader
- from nltk.model.ngram import NgramModel
- from nltk.probability import LidstoneProbDist
- import urllib2
def train():
    """Build a trigram language model from the MEDLINE sample corpus.

    If ``medsamp2013.xml`` is not already present in the current directory
    it is downloaded first. The corpus words are then regrouped into
    sentences, each sentence is tokenized, and a trigram model is trained
    on the resulting list of token lists.

    Returns:
        An ``NgramModel`` of order 3 using a ``LidstoneProbDist`` estimator
        with gamma 0.2.
    """
    corpus_file = "medsamp2013.xml"
    if not os.path.isfile(corpus_file):
        # Download under a temporary name and rename only once the transfer
        # has finished, so an interrupted download is never mistaken for a
        # complete corpus by the isfile() check on a later run.
        partial_file = corpus_file + ".part"
        ifile = urllib2.urlopen(
            "http://www.nlm.nih.gov/databases/dtd/medsamp2013.xml")
        try:
            with open(partial_file, 'wb') as ofile:
                for line in ifile:
                    ofile.write(line)
        finally:
            ifile.close()
        os.rename(partial_file, corpus_file)

    reader = XMLCorpusReader(".", corpus_file)

    # Reconstitute the reader's list of words into a list of sentences,
    # then split every sentence back into its own token list.
    sentences = nltk.sent_tokenize(" ".join(reader.words()))
    words = [nltk.word_tokenize(sentence) for sentence in sentences]

    def est(fdist, bins):
        # `bins` is accepted because the model passes it, but it is unused.
        return LidstoneProbDist(fdist, 0.2)

    # Build the trigram language model. The estimator argument is optional:
    # NgramModel(3, words) is the alternative without a custom estimator.
    return NgramModel(3, words, estimator=est)
def test(lm):
    """Print the summed trigram log-probability of two sample sentences.

    Each sentence is tokenized and split into trigrams. For every trigram
    the model is asked for the log-probability of the third word given the
    first two, and the values are summed over the sentence.

    Args:
        lm: A trained model exposing ``logprob(word, context)``, such as
            the one returned by ``train()``.

    Output:
        One line per sentence of the form
        ``SENTENCE: <sentence> ( <summed logprob> )``.
    """
    testSentences = [
        "Hearing loss is being partly or totally unable to hear sound in one or both ears.",
        "In law, a hearing is a proceeding before a court or other decision-making body or officer, such as a government agency."
    ]
    for sentence in testSentences:
        words = nltk.word_tokenize(sentence)
        slogprob = 0
        for trigram in nltk.trigrams(words):
            word = trigram[2]
            # The context is the two words preceding `word` inside this
            # trigram, not a slice of the sentence's whole trigram sequence.
            context = list(trigram[:-1])
            slogprob += lm.logprob(word, context)
        # A single parenthesized argument prints the same line whether
        # print is a statement or a function.
        print("SENTENCE: %s ( %s )" % (sentence, slogprob))
def main():
    """Train the trigram model, then score the sample sentences with it."""
    model = train()
    test(model)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement