import math

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Uncomment on first use; word_tokenize needs the 'punkt' tokenizer data.
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('nonbreaking_prefixes')
# nltk.download('perluniprops')

def calc_perplexity(bigram_prob, bigrams):
    # Perplexity = 2 ** (-(1/N) * sum of log2 P(w_i | w_(i-1))), summed over
    # every bigram occurrence in the text, not just each distinct bigram once.
    log_total = 0
    for bg in bigrams:
        log_total += math.log2(bigram_prob[bg])
    log_total /= len(bigrams)
    return 2 ** (-log_total)

# Read the corpus, lowercase it, and tokenize it into words.
with open("Cow.txt", "r") as f:
    contents = f.read().lower()
text = word_tokenize(contents)

# Prepend a sentence-boundary marker so the first word also has a
# conditioning context.
bigrams = list(nltk.bigrams(text))
bigrams.insert(0, ('.', text[0]))
text.insert(0, '.')
print(bigrams)

# Maximum-likelihood estimate: P(w2 | w1) = count(w1, w2) / count(w1).
fdist1 = FreqDist(bigrams)  # bigram counts
fdist2 = FreqDist(text)     # unigram counts
bigram_prob = {}
for bg in bigrams:
    bigram_prob[bg] = fdist1[bg] / fdist2[bg[0]]

perplexity = calc_perplexity(bigram_prob, bigrams)
print(perplexity)
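
As a sanity check, the perplexity can be worked by hand on a tiny corpus. The sketch below assumes a made-up Cow.txt containing "the cow eats grass . the cow sleeps ." (the real file is not part of this paste); it reproduces the same maximum-likelihood computation and the value the script above would print for that input.

import math

# A minimal hand-worked sketch; the token list below is an assumed
# stand-in for Cow.txt, which is not included in the paste.
tokens = ['.', 'the', 'cow', 'eats', 'grass', '.', 'the', 'cow', 'sleeps', '.']
bigrams = list(zip(tokens, tokens[1:]))  # 9 bigram occurrences

# Maximum-likelihood estimates P(w2 | w1) = count(w1, w2) / count(w1):
#   P(the | .) = 2/3, P(eats | cow) = 1/2, P(sleeps | cow) = 1/2,
#   and every other observed bigram has probability 1 (its log2 term is 0).
log_total = 2 * math.log2(2 / 3) + 2 * math.log2(1 / 2)
perplexity = 2 ** (-log_total / len(bigrams))
print(perplexity)  # ~1.28: a value near 1 means the text is highly predictable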