Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2019
245
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.28 KB | None | 0 0
  1. import string
  2. from gensim import corpora
  3.  
  4. import nltk
  5. from nltk.stem.wordnet import WordNetLemmatizer
  6. from nltk.corpus import stopwords, wordnet
  7. # nltk.download('stopwords')
  8. # nltk.download('wordnet')
  9.  
  10. class Cleaner:
  11.  
  12. def __init__(self):
  13. # Punctuations and stopwords
  14. self.punctuation = set(string.punctuation)
  15. self.stoplist = set(stopwords.words('english'))
  16.  
  17. # LDA
  18. self.dictionary = corpora.Dictionary()
  19. self.lemma = WordNetLemmatizer()
  20.  
  21. def remove_punctuation(self, text):
  22. return ''.join([char for char in text if char not in self.punctuation])
  23.  
  24.  
  25. def remove_numbers(self, text):
  26. return ''.join([char for char in text if not char.isdigit()])
  27.  
  28.  
  29. def remove_stopwords(self, text):
  30. return ' '.join([word for word in text.split() if word not in self.stoplist])
  31.  
  32.  
  33. def remove_single_chars(self, text):
  34. return ' '.join([word for word in text.split() if len(word) > 1])
  35.  
  36.  
  37. def lemmatize(self, text):
  38. return ' '.join([self.lemma.lemmatize(word) for word in text.split()])
  39.  
  40.  
  41. def clean_text(self, text):
  42. text = text.replace('\n', '')
  43. text = self.remove_punctuation(text)
  44. text = self.remove_numbers(text)
  45. text = self.remove_stopwords(text)
  46. text = self.remove_single_chars(text)
  47. text = self.lemmatize(text)
  48. return text
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement