import nltk
from nltk.tokenize import word_tokenize       # word tokenizer (Treebank-style)
from nltk.text import Text
import sys
from nltk.tokenize import wordpunct_tokenize  # splits text on whitespace and punctuation (except underscores)

string = "It is the branch of data science that consists of systematic processes for analyzing, understanding, and deriving information from text data in a smart and efficient manner."
tokens = wordpunct_tokenize(string)
print(tokens)  # tokenization complete
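# A quick comparison sketch (not part of the original paste): word_tokenize keeps
# contractions together as pairs like "ca"/"n't", while wordpunct_tokenize splits
# strictly on punctuation; both keep underscores inside tokens. The sample sentence
# is illustrative only.
# nltk.download('punkt')  # word_tokenize needs the Punkt models on a fresh install
sample = "Don't split snake_case, but do split can't!"
print(word_tokenize(sample))       # ['Do', "n't", 'split', 'snake_case', ',', ...]
print(wordpunct_tokenize(sample))  # ['Don', "'", 't', 'split', 'snake_case', ',', ...]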
# Exploring NLTK's stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

print(stopwords.readme().replace('\n', ' '))  # replace newlines with spaces so the README is readable on one line
# Look at the first few stop words for several languages
print(stopwords.words('english')[:5])
print(stopwords.words('greek')[:5])
print(stopwords.words('french')[:5])
print(stopwords.words('german')[:5])
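# A small addition (not in the original paste): the corpus reader's fileids()
# method lists every language that has a stop word file, which is a quick way to
# check whether a language such as 'greek' is available before calling words().
print(stopwords.fileids())  # e.g. ['arabic', 'azerbaijani', ..., 'english', ...]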
# Count the English stop words with len()
print(len(stopwords.words('english')))
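# A minimal sketch of the usual next step (assumption: this step is not in the
# original paste): removing English stop words from the tokens produced above.
# Comparison is done in lowercase because the stop word list is lowercase.
stop_set = set(stopwords.words('english'))
filtered_tokens = [t for t in tokens if t.lower() not in stop_set]
print(filtered_tokens)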