Guest User

Untitled

a guest
May 27th, 2018
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.24 KB | None | 0 0
  1. import stop_words
  2. from langdetect import detect
  3. import nltk
  4. nltk.download('stopwords')
  5. from nltk.corpus import stopwords
  6. import ast
  7.  
  8.  
  9. def get_all_stopwords(text_sample='This is an englisch sentence'):
  10. """ Combines Stopwords for englisch, german, french, and spanish from NLTK. Further adds stopwords from the stop_words module.
  11. Finally, stopwords from a text file stopwords.txt are added to come up with a large list of stopwords."""
  12. # detect language
  13. lang = detect(text_sample)
  14. print('DETECTED LANGUAGE : {}'.format(lang))
  15.  
  16. # get nltk stopwords for common languages
  17. stopwordssss = stopwords.words('german') + \
  18. stopwords.words('english') + \
  19. stopwords.words('french') + \
  20. stopwords.words('spanish')
  21.  
  22.  
  23. # read from stopwords.txt file
  24. aa = []
  25. with open('stopwords.txt', encoding='utf-8-sig') as f:
  26. aa.append(f.read())
  27. stopword_dict = ast.literal_eval(aa[0])
  28.  
  29. # join stop words from nltk, txt and from library stop_words
  30. stopwordss = set(stopwordssss) | set(stop_words.get_stop_words(lang)) |set(stopword_dict)
  31. stopwordlist = [*stopwordss]
  32.  
  33. return stopwordlist, lang
  34.  
  35.  
  36.  
  37. def _find_language(text):
  38. if text != '':
  39. return detect(text[:5000])
Add Comment
Please, Sign In to add comment