Advertisement
Guest User

Untitled

a guest
Sep 17th, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.90 KB | None | 0 0
  # Demo script: tokenize a sample sentence with NLTK, then explore the
  # stop-word corpus that ships with NLTK.
  import sys

  import nltk
  from nltk.corpus import stopwords
  from nltk.text import Text
  from nltk.tokenize import word_tokenize  # general-purpose text tokenizer
  from nltk.tokenize import wordpunct_tokenize  # splits on whitespace and punctuation (except underscore)

  # Sample sentence to tokenize.
  sample_text = "It is the branch of data science that consists of systematic processes for analyzing, understanding, and how to driving information from the text data in a smart and efficient manner."
  tokens = wordpunct_tokenize(sample_text)
  print(tokens)  # tokenization complete

  # Exploring NLTK's stop words.
  # The corpus has to be downloaded before the `stopwords` reader is first used.
  nltk.download('stopwords')

  # Newlines in the README are replaced with spaces so it reads as one paragraph.
  # The result is printed explicitly: a bare expression shows nothing when this
  # file is run as a script rather than typed into an interactive session.
  print(stopwords.readme().replace('\n', ' '))

  # Show the first five stop words for each language of interest.
  for language in ('english', 'greek', 'french', 'german'):
      print(language, stopwords.words(language)[:5])

  # Count the English stop words with len().
  print(len(stopwords.words('english')))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement