Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- import math
- from nltk.tokenize import word_tokenize
- from nltk.probability import FreqDist
- # nltk.download('brown')
- # nltk.download('nonbreaking_prefixes')
- # nltk.download('perluniprops')
def calc_perplexity(bigram_prob, length):
    """Return the perplexity implied by a mapping of per-token probabilities.

    Each value in *bigram_prob* is a conditional probability P(w2 | w1);
    perplexity is 2 ** -(mean log2 probability), with the mean taken over
    *length* terms.
    """
    total_log2 = sum(math.log2(p) for p in bigram_prob.values())
    avg_log2 = total_log2 / length
    return 2 ** (-avg_log2)
# Build a maximum-likelihood bigram model from Cow.txt and report its
# perplexity on the same text (training-set perplexity).
with open("Cow.txt", "r") as f:  # `with` guarantees the file is closed
    contents = f.read().lower()

text = word_tokenize(contents)
bigrams = list(nltk.bigrams(text))

# Prepend a sentence-boundary marker '.' so the first real token also has a
# conditioning context: P(text[0] | '.').
bigrams.insert(0, ('.', text[0]))
text.insert(0, '.')
print(bigrams)

fdist1 = FreqDist(bigrams)  # bigram counts
fdist2 = FreqDist(text)     # unigram counts

# P(w2 | w1) = count(w1, w2) / count(w1), one entry per *occurrence* so a
# bigram appearing k times contributes k log-prob terms to the total.
# NOTE(review): the original keyed this dict by the bigram tuple, which
# collapsed repeated bigrams into a single term while still normalizing by
# len(text) — understating the log-prob sum and skewing the perplexity.
bigram_prob = {idx: fdist1[bg] / fdist2[bg[0]] for idx, bg in enumerate(bigrams)}

# Normalizer kept as len(text) to match the original script; note it is
# len(bigrams) + 1 after the boundary insertion.
perplexity = calc_perplexity(bigram_prob, len(text))
print(perplexity)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement