Advertisement
Guest User

Untitled

a guest
May 22nd, 2019
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.52 KB | None | 0 0
  1. import gensim.parsing.preprocessing as gsp
  2. from pyspark.sql.functions import udf
  3. from pyspark.sql.types import StringType
  4. from gensim import utils
  5.  
  6.  
  7. filters = [
  8. gsp.strip_tags,
  9. gsp.strip_punctuation,
  10. gsp.strip_multiple_whitespaces,
  11. gsp.strip_numeric,
  12. gsp.remove_stopwords,
  13. gsp.strip_short,
  14. gsp.stem_text
  15. ]
  16.  
  17. def clean_text(x):
  18. s = x[1]
  19. s = s.lower()
  20. s = utils.to_unicode(s)
  21. for f in filters:
  22. s = f(s)
  23. return (x[0],s)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement