SHARE
TWEET

Untitled

a guest Dec 13th, 2018 62 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.pipeline import Pipeline, FeatureUnion
  4. from sklearn.preprocessing import OneHotEncoder, StandardScaler
  5. from sklearn.base import BaseEstimator, TransformerMixin
  6.  
  7.  
  8. class TypeSelector(BaseEstimator, TransformerMixin):
  9.     def __init__(self, dtype):
  10.         self.dtype = dtype
  11.        
  12.     def fit(self, X, y=None):
  13.         return self
  14.    
  15.     def transform(self, X):
  16.         assert isinstance(X, pd.DataFrame)
  17.         return X.select_dtypes(include=[self.dtype])
  18.    
  19.  
  20. class StringIndexer(BaseEstimator, TransformerMixin):
  21.     def fit(self, X, y=None):
  22.         return self
  23.     def transform(self, X):
  24.         assert isinstance(X, pd.DataFrame)
  25.         return X.apply(lambda s: s.cat.codes.replace(
  26.             {-1: len(s.cat.categories)}))
  27.        
  28.    
  29. transformer = Pipeline([
  30.     ('features', FeatureUnion(n_jobs=1, transformer_list=[
  31.         # Part 1
  32.         ('boolean', Pipeline([
  33.             ('selector', TypeSelector('bool')),
  34.         ])),  # booleans close
  35.        
  36.         ('numericals', Pipeline([
  37.             ('selector', TypeSelector(np.number)),
  38.             ('scaler', StandardScaler()),
  39.         ])),  # numericals close
  40.        
  41.         # Part 2
  42.         ('categoricals', Pipeline([
  43.             ('selector', TypeSelector('category')),
  44.             ('labeler', StringIndexer()),
  45.             ('encoder', OneHotEncoder(handle_unknown='ignore')),
  46.         ]))  # categoricals close
  47.     ])),  # features close
  48. ])  # pipeline close
  49.  
  50.  
  51.  
  52. df = pd.DataFrame({
  53.     'boolean_column': [True,False,True,False],
  54.     'integer_column': [1,2,3,4],
  55.     'float_column': [1.,2.,3.,4.]
  56. })
  57.  
  58.  
  59. df_transformed = transformer.fit_transform(df)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top