Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- @author: Sourabh Garg
- """
- import os
- from cltk.corpus.utils.importer import CorpusImporter
- from cltk.tokenize.indian_tokenizer import *
- from cltk.utils.frequency import Frequency
- from cltk.stem.sanskrit.indian_syllabifier import Syllabifier
# Path of the text to analyse, relative to the user's home directory.
# '~' is expanded at run time so the script is not tied to one user account.
# NOTE(review): assumes CLTK stores imported corpora under ~/cltk_data
# (its documented default) -- confirm for the installed CLTK version.
BHAGPUR_TEXT_PATH = (
    '~/cltk_data/sanskrit/text/sanskrit_text_sanskrit_documents/'
    'Puranas/bhagpur_cleaned.txt'
)


def lexical_diversity(distinct_count, total_count):
    """Return the ratio of distinct words to total words.

    Args:
        distinct_count: Number of distinct words.
        total_count: Number of words overall.

    Returns:
        ``distinct_count / total_count``, or ``0.0`` when ``total_count``
        is zero (e.g. an empty text) instead of raising ZeroDivisionError.
    """
    if not total_count:
        return 0.0
    return distinct_count / total_count


def main():
    """Import the Sanskrit corpus, read one text and print word statistics.

    Prints the available corpora, the total number of tokens, the number
    of distinct words and the resulting lexical diversity.
    """
    # Importing a Sanskrit corpus.
    importer = CorpusImporter('sanskrit')
    print(importer.list_corpora)
    importer.import_corpus('sanskrit_text_sanskrit_documents')

    # Opening and reading a text from the corpus. Undecodable bytes are
    # dropped (errors='ignore') rather than aborting the run.
    text_path = os.path.expanduser(BHAGPUR_TEXT_PATH)
    with open(text_path, 'rt', encoding='utf-8', errors='ignore') as f:
        text = f.read()

    # Tokenize using CLTK.
    tokens = indian_punctuation_tokenize_regex(text)
    print("number of total words", len(tokens))

    # To find distinct words.
    # NOTE(review): the total above comes from
    # indian_punctuation_tokenize_regex while the distinct count comes from
    # Frequency.counter_from_str; the two may split the text differently --
    # confirm that the ratio below is the intended metric.
    word_counts = Frequency().counter_from_str(text)
    print("number of distinct words", len(word_counts))

    # Lexical diversity = unique words / total words.
    print("lexical diversity=", lexical_diversity(len(word_counts), len(tokens)))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement