Guest User

Untitled

a guest
Jul 16th, 2018
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.68 KB | None | 0 0
  1. # Split text into words
  2. from nltk.tokenize import word_tokenize
  3. tokens = word_tokenize(text)
  4.  
  5. # Convert words to lower case
  6. tokens = [w.lower() for w in tokens]
  7.  
  8. # Remove punctuation from each word
  9. import string
  10. table = str.maketrans('', '', string.punctuation)
  11. stripped = [w.translate(table) for w in tokens]
  12.  
  13. # Remove tokens that are not alphabetic
  14. words = [word for word in stripped if word.isalpha()]
  15.  
  16. # Filter out stop words
  17. from nltk.corpus import stopwords
  18. stop_words = set(stopwords.words('english'))
  19. words = [w for w in words if not w in stop_words]
  20.  
  21. # Stemming of words
  22. from nltk.stem.porter import PorterStemmer
  23. porter = PorterStemmer()
  24. stemmed = [porter.stem(word) for word in tokens]
Add Comment
Please, Sign In to add comment