Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def clean_post(post):
    """Tokenize the ``body`` column of *post* and strip English stop words.

    Args:
        post: DataFrame of posts. It must expose a ``body`` column, which is
            the tokenizer's input (presumably a string column -- confirm
            against the caller).

    Returns:
        The transformed DataFrame with two added columns:
        ``words`` (the tokens of ``body``) and ``words_filt`` (``words``
        with the English stop words removed).
    """
    # "\\W" is handed to RegexTokenizer as the pattern separating tokens,
    # i.e. any single non-word character.
    tokenizer = RegexTokenizer(inputCol="body", outputCol="words", pattern="\\W")
    tokenized = tokenizer.transform(post)

    # loadDefaultStopWords() only *returns* the word list; it does not
    # configure the remover. Pass the list in explicitly so the language
    # choice actually takes effect instead of being silently discarded.
    english_stop_words = StopWordsRemover.loadDefaultStopWords('english')
    remover = StopWordsRemover(
        inputCol="words",
        outputCol="words_filt",
        stopWords=english_stop_words,
    )
    return remover.transform(tokenized)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement