import nltk
from nltk.tokenize import word_tokenize       # word tokenizer (Treebank-style)
from nltk.text import Text
import sys
from nltk.tokenize import wordpunct_tokenize  # splits text on whitespace and punctuation (except underscores)

string = "It is the branch of data science that consists of systematic processes for analyzing, understanding, and deriving information from text data in a smart and efficient manner."
tokens = wordpunct_tokenize(string)
print(tokens)  # tokenization complete
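# A quick comparison sketch (not part of the original paste): word_tokenize keeps
# contractions together as pairs like "ca"/"n't", while wordpunct_tokenize splits
# strictly on punctuation; both keep underscores inside tokens. The sample sentence
# is illustrative only.
# nltk.download('punkt')  # word_tokenize needs the Punkt models on a fresh install
sample = "Don't split snake_case, but do split can't!"
print(word_tokenize(sample))       # ['Do', "n't", 'split', 'snake_case', ',', ...]
print(wordpunct_tokenize(sample))  # ['Don', "'", 't', 'split', 'snake_case', ',', ...]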
# Exploring NLTK's stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

print(stopwords.readme().replace('\n', ' '))  # replace newlines with spaces so the README is readable on one line
# Look at the first few stop words for several languages
print(stopwords.words('english')[:5])
print(stopwords.words('greek')[:5])
print(stopwords.words('french')[:5])
print(stopwords.words('german')[:5])
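# A small addition (not in the original paste): the corpus reader's fileids()
# method lists every language that has a stop word file, which is a quick way to
# check whether a language such as 'greek' is available before calling words().
print(stopwords.fileids())  # e.g. ['arabic', 'azerbaijani', ..., 'english', ...]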
# Count the English stop words with len()
print(len(stopwords.words('english')))
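# A minimal sketch of the usual next step (assumption: this step is not in the
# original paste): removing English stop words from the tokens produced above.
# Comparison is done in lowercase because the stop word list is lowercase.
stop_set = set(stopwords.words('english'))
filtered_tokens = [t for t in tokens if t.lower() not in stop_set]
print(filtered_tokens)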