Advertisement
Guest User

Untitled

a guest
Jul 26th, 2017
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.28 KB | None | 0 0
  1. from nltk.corpus import stopwords
  2.  
  3. def removeStopwords( palabras ):
  4. return [ word for word in palabras if word not in stopwords.words('spanish') ]
  5.  
  6. palabras = ''' my text is here '''
  7.  
  8. >>> palabras = "Buenos dias"
  9. >>> [c for c in palabras]
  10. ['B', 'u', 'e', 'n', 'o', 's', ' ', 'd', 'i', 'a', 's']
  11.  
  12. import re
  13. from nltk.corpus import stopwords
  14.  
  15. scentence = 'El problema del matrimonio es que se acaba todas las noches despues de hacer el amor, y hay que volver a reconstruirlo todas las mananas antes del desayuno.'
  16.  
  17. #We only want to work with lowercase for the comparisons
  18. scentence = scentence.lower()
  19.  
  20. #remove punctuation and split into seperate words
  21. words = re.findall(r'w+', scentence,flags = re.UNICODE | re.LOCALE)
  22.  
  23. #This is the simple way to remove stop words
  24. important_words=[]
  25. for word in words:
  26. if word not in stopwords.words('spanish'):
  27. important_words.append(word)
  28.  
  29. print important_words
  30.  
  31. #This is the more pythonic way
  32. important_words = filter(lambda x: x not in stopwords.words('spanish'), words)
  33.  
  34. print important_words
  35.  
  36. def remove_stopwords(sentence, language):
  37. return [ token for token in nltk.word_tokenize(sentence) if token.lower() not in stopwords.words(language) ]
  38.  
  39. >>> import nltk
  40. >>> nltk.download()
  41. showing info http://nltk.github.com/nltk_data
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement