Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Turn long-format (user_id, fruit) rows into one row per user with a 0/1
# indicator column for every distinct fruit, using scikit-learn's
# MultiLabelBinarizer.
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# load sample data
df = pd.DataFrame(
    {
        "user_id": ["1", "1", "2", "2", "2", "3"],
        "fruits": ["banana", "orange", "orange", "apple", "banana", "mango"],
    }
)

# collect fruits for each user: one row per user_id, `fruits` holds a list
transformed_df = df.groupby("user_id").agg({"fruits": list}).reset_index()
print(transformed_df)
# Output (column alignment reconstructed; the paste had collapsed whitespace):
#   user_id                   fruits
# 0       1         [banana, orange]
# 1       2  [orange, apple, banana]
# 2       3                  [mango]

# perform MultiLabelBinarizer
# Pop the list column first, as its own statement: `transformed_df` must no
# longer contain `fruits` when the indicator columns are joined onto it, and
# doing the mutation here keeps that ordering explicit instead of hiding it
# inside the argument of the join call.
fruit_lists = transformed_df.pop("fruits")
indicator_df = pd.DataFrame(
    mlb.fit_transform(fruit_lists),
    columns=mlb.classes_,
    # Reuse the per-user index so the join below lines rows up one-to-one.
    index=transformed_df.index,
)
final_df = transformed_df.join(indicator_df)
print(final_df)
# Output (column alignment reconstructed; the paste had collapsed whitespace):
#   user_id  apple  banana  mango  orange
# 0       1      0       1      0       1
# 1       2      1       1      0       1
# 2       3      0       0      1       0
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement