Advertisement
Guest User

Untitled

a guest
Dec 11th, 2016
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.91 KB | None | 0 0
  1. """
  2. @author: Sourabh Garg
  3. """
  4. import os
  5. from cltk.corpus.utils.importer import CorpusImporter
  6. from cltk.tokenize.indian_tokenizer import *
  7. from cltk.utils.frequency import Frequency
  8. from cltk.stem.sanskrit.indian_syllabifier import Syllabifier
  9.  
  10.  
  11. #importing a sanskrit corpus
  12. c = CorpusImporter('sanskrit')
  13. print(c.list_corpora)
  14. c.import_corpus('sanskrit_text_sanskrit_documents')
  15.  
  16. #opening and reading a text from the corpus
  17. file = os.path.expanduser('C:/Users/user/cltk_data/sanskrit/text/sanskrit_text_sanskrit_documents/Puranas/bhagpur_cleaned.txt')
  18. with open(file,'rt',encoding='utf-8',errors='ignore') as f:
  19. r=f.read()
  20.  
  21. #tokenize using cltk
  22. x=indian_punctuation_tokenize_regex(r)
  23. print("number of total words",len(x))
  24.  
  25. #To find distinct words
  26. freq=Frequency()
  27. y=freq.counter_from_str(r)
  28. print("number of distinct words", len(y))
  29.  
  30. #lexical diversity=unique words/total words
  31. print("lexical diversity=",len(y)/len(x))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement