Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import org.apache.spark.ml.feature.RegexTokenizer

// Tokenize the `reviewText` column of `dfNew` into a new `words` column.
//
// The pattern below lists what is treated as a separator. The tokenizer is
// otherwise left at its defaults, which per the Spark docs means the pattern
// matches the gaps between tokens rather than the tokens themselves
// (NOTE(review): confirm against the Spark version in use).
//   \p{Digit}            any digit
//   \p{Space}            any whitespace character
//   [\p{Punct}&&[^']]    any punctuation character except the apostrophe
//   (?<![a-zA-Z])'       an apostrophe not preceded by an ASCII letter
//   '(?![a-zA-Z])        an apostrophe not followed by an ASCII letter
//   \“                   a left curly double quote
// Net effect on apostrophes: one is kept only when it sits between two ASCII
// letters; every other apostrophe is a separator.
val tokenSplitPattern =
  "\\p{Digit}|\\p{Space}|[\\p{Punct}&&[^']]|(?<![a-zA-Z])'|'(?![a-zA-Z])|\\“"

// Configure the tokenizer step by step; each setter updates the same instance.
val regexTokenizer: RegexTokenizer = {
  val tokenizer = new RegexTokenizer()
  tokenizer.setInputCol("reviewText")
  tokenizer.setOutputCol("words")
  tokenizer.setPattern(tokenSplitPattern)
  tokenizer
}

// `dfNew` is defined outside this snippet; it must provide a `reviewText` column.
val tokenized = regexTokenizer.transform(dfNew)

// Display a preview of the tokenized rows.
tokenized.show()
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement