Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from pyspark.ml.feature import StringIndexer
- from pyspark.ml.feature import OneHotEncoder
- # ...
def one_hot_encode(_df, input_column, output_column):
    """One-hot encode a categorical column of a Spark DataFrame.

    The column is first mapped to numeric category indices with
    ``StringIndexer`` (written to ``<input_column>_indexed``), and that
    index column is then expanded with ``OneHotEncoder`` into
    ``output_column``.

    Args:
        _df: DataFrame containing ``input_column``.
        input_column: Name of the categorical column to encode.
        output_column: Name of the column that receives the encoded vector.

    Returns:
        A new DataFrame with both the intermediate
        ``<input_column>_indexed`` column and ``output_column`` added.
        Because the indexer uses ``handleInvalid='skip'``, rows the indexer
        treats as invalid are dropped rather than raising an error.
    """
    indexed_column = input_column + "_indexed"

    indexer = StringIndexer(
        inputCol=input_column,
        outputCol=indexed_column,
        handleInvalid='skip',
    )
    indexed_df = indexer.fit(_df).transform(_df)

    # dropLast=True omits the last category from the encoded vector.
    encoder = OneHotEncoder(
        inputCol=indexed_column,
        outputCol=output_column,
        dropLast=True,
    )
    # Spark 3.x made OneHotEncoder an Estimator: it must be fitted to obtain
    # a model that can transform. Spark 2.x exposed it as a plain Transformer
    # without fit(). Fit when available so the function works on both.
    if hasattr(encoder, "fit"):
        encoder = encoder.fit(indexed_df)
    return encoder.transform(indexed_df)
Add Comment
Please, Sign In to add comment