Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import gensim.parsing.preprocessing as gsp
- from pyspark.sql.functions import udf
- from pyspark.sql.types import StringType
- from gensim import utils
# Ordered gensim preprocessing steps that clean_text applies to each document.
# Order matters: every filter receives the output of the previous one.
filters = [
    gsp.strip_tags,                  # remove markup tags
    gsp.strip_punctuation,           # replace punctuation with spaces
    gsp.strip_multiple_whitespaces,  # collapse runs of whitespace
    gsp.strip_numeric,               # remove digits
    gsp.remove_stopwords,            # drop gensim's built-in stopwords
    gsp.strip_short,                 # drop very short tokens (gensim default minsize)
    gsp.stem_text,                   # stem the remaining tokens
]
def clean_text(x):
    """Normalize the text element of an ``(id, text)`` record.

    Args:
        x: An indexable record. Element 0 is passed through untouched;
            element 1 is the raw text (``str``, or ``bytes`` that gensim's
            ``utils.to_unicode`` can decode).

    Returns:
        A 2-tuple ``(x[0], cleaned_text)`` where ``cleaned_text`` is the
        lowercased text after every callable in the module-level ``filters``
        list has been applied to it in order.
    """
    # Decode before lowercasing: bytes.lower() only folds ASCII letters, so
    # lowercasing the raw bytes first would leave non-ASCII capitals intact.
    text = utils.to_unicode(x[1]).lower()
    for apply_filter in filters:
        text = apply_filter(text)
    return (x[0], text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement