Advertisement
Guest User

Untitled

a guest
Dec 13th, 2018
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.70 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.pipeline import Pipeline, FeatureUnion
  4. from sklearn.preprocessing import OneHotEncoder, StandardScaler
  5. from sklearn.base import BaseEstimator, TransformerMixin
  6.  
  7.  
  8. class TypeSelector(BaseEstimator, TransformerMixin):
  9.     def __init__(self, dtype):
  10.         self.dtype = dtype
  11.        
  12.     def fit(self, X, y=None):
  13.         return self
  14.    
  15.     def transform(self, X):
  16.         assert isinstance(X, pd.DataFrame)
  17.         return X.select_dtypes(include=[self.dtype])
  18.    
  19.  
  20. class StringIndexer(BaseEstimator, TransformerMixin):
  21.     def fit(self, X, y=None):
  22.         return self
  23.     def transform(self, X):
  24.         assert isinstance(X, pd.DataFrame)
  25.         return X.apply(lambda s: s.cat.codes.replace(
  26.             {-1: len(s.cat.categories)}))
  27.        
  28.    
  29. transformer = Pipeline([
  30.     ('features', FeatureUnion(n_jobs=1, transformer_list=[
  31.         # Part 1
  32.         ('boolean', Pipeline([
  33.             ('selector', TypeSelector('bool')),
  34.         ])),  # booleans close
  35.        
  36.         ('numericals', Pipeline([
  37.             ('selector', TypeSelector(np.number)),
  38.             ('scaler', StandardScaler()),
  39.         ])),  # numericals close
  40.        
  41.         # Part 2
  42.         ('categoricals', Pipeline([
  43.             ('selector', TypeSelector('category')),
  44.             ('labeler', StringIndexer()),
  45.             ('encoder', OneHotEncoder(handle_unknown='ignore')),
  46.         ]))  # categoricals close
  47.     ])),  # features close
  48. ])  # pipeline close
  49.  
  50.  
  51.  
  52. df = pd.DataFrame({
  53.     'boolean_column': [True,False,True,False],
  54.     'integer_column': [1,2,3,4],
  55.     'float_column': [1.,2.,3.,4.]
  56. })
  57.  
  58.  
  59. df_transformed = transformer.fit_transform(df)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement