Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn.pipeline import Pipeline, FeatureUnion
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
- from sklearn.base import BaseEstimator, TransformerMixin
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype : object
        Dtype specification handed to ``DataFrame.select_dtypes`` as the
        single entry of its ``include`` list (for example ``'bool'``,
        ``'category'`` or ``np.number``).
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the selector has no fitted state."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        Raises
        ------
        TypeError
            If ``X`` is not a ``pandas.DataFrame``.
        """
        # An explicit raise (rather than ``assert``) keeps the check active
        # when Python runs with -O, which strips assert statements.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "TypeSelector expects a pandas DataFrame, got "
                + type(X).__name__
            )
        return X.select_dtypes(include=[self.dtype])
class StringIndexer(BaseEstimator, TransformerMixin):
    """Transformer that replaces categorical columns by their integer codes.

    Every column of the input frame is expected to have a categorical dtype,
    because the ``.cat`` accessor is used on each of them.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the indexer has no fitted state."""
        return self

    def transform(self, X):
        """Return a frame holding the category codes of every column of ``X``.

        The code ``-1`` (which pandas assigns to values that belong to no
        category, such as missing values) is remapped to
        ``len(categories)``, so all resulting codes are non-negative.

        Raises
        ------
        TypeError
            If ``X`` is not a ``pandas.DataFrame``.
        """
        # An explicit raise (rather than ``assert``) keeps the check active
        # when Python runs with -O, which strips assert statements.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                "StringIndexer expects a pandas DataFrame, got "
                + type(X).__name__
            )

        def _encode(column):
            # Use the first index past the real categories for code -1.
            fallback_code = len(column.cat.categories)
            return column.cat.codes.replace({-1: fallback_code})

        return X.apply(_encode)
# Preprocessing pipeline: a single 'features' step that runs three
# dtype-specific sub-pipelines and combines their outputs in a FeatureUnion.
transformer = Pipeline(steps=[
    (
        'features',
        FeatureUnion(
            transformer_list=[
                # Boolean columns: selected only, no further processing.
                (
                    'boolean',
                    Pipeline(steps=[
                        ('selector', TypeSelector('bool')),
                    ]),
                ),
                # Numeric columns: selected, then standardized.
                (
                    'numericals',
                    Pipeline(steps=[
                        ('selector', TypeSelector(np.number)),
                        ('scaler', StandardScaler()),
                    ]),
                ),
                # Categorical columns: selected, turned into integer codes,
                # then one-hot encoded.
                (
                    'categoricals',
                    Pipeline(steps=[
                        ('selector', TypeSelector('category')),
                        ('labeler', StringIndexer()),
                        ('encoder', OneHotEncoder(handle_unknown='ignore')),
                    ]),
                ),
            ],
            n_jobs=1,
        ),
    ),
])
# Small demo frame with one boolean, one integer and one float column.
# NOTE(review): there is no 'category' column here, so the 'categoricals'
# branch of the pipeline receives a zero-column selection; confirm that the
# installed scikit-learn encoder accepts such input.
df = pd.DataFrame(
    data={
        'boolean_column': [True, False, True, False],
        'integer_column': [1, 2, 3, 4],
        'float_column': [1., 2., 3., 4.],
    },
)

# Fit the preprocessing pipeline on the demo frame and transform it.
df_transformed = transformer.fit_transform(df)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement