Advertisement
tony_raven

full

Mar 26th, 2019
175
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.40 KB | None | 0 0
  1. from sklearn.base import BaseEstimator, TransformerMixin
  2. from sklearn.pipeline import Pipeline, FeatureUnion
  3. from sklearn.ensemble import RandomForestClassifier
  4. from sklearn.preprocessing import LabelEncoder
  5. import pandas as pd
  6. import numpy as np
  7.  
  class FeatureSelector(BaseEstimator, TransformerMixin):
      """Stateless transformer that selects part of the input by column label.

      ``feature_name`` is handed unchanged to ``X.loc[:, ...]`` in
      ``transform``, so ``X`` must support label-based ``.loc`` indexing.
      """

      def __init__(self, feature_name):
          self.feature_name = feature_name

      def fit(self, X, y=None):
          """Learn nothing; return the transformer itself."""
          return self

      def transform(self, X):
          """Return all rows of ``X`` restricted to ``self.feature_name``."""
          wanted = self.feature_name
          return X.loc[:, wanted]
  17.  
  class CustomBinTransformer(BaseEstimator, TransformerMixin):
      """Replace each numeric value with the integer index of its bin.

      ``ranges`` is passed to ``pandas.cut`` as the bin edges. Bins are
      closed on the right, and the first bin also includes its left edge.
      Values that fall outside the edges come out as NaN.
      """

      def __init__(self, ranges):
          self.ranges = ranges

      def fit(self, X, y=None):
          """Learn nothing; return the transformer itself."""
          return self

      def transform(self, X):
          """Bin the 1-D input ``X`` and return the bin indices as a column.

          Returns an array of shape ``(len(X), 1)``.
          """
          # Example binning only: put your own binning logic here, or use a
          # ready-made scikit-learn transformer if one fits.
          # include_lowest=True keeps a value equal to the first edge in bin 0
          # instead of turning it into NaN.
          codes = pd.cut(X, self.ranges, labels=False, include_lowest=True)
          # pd.cut returns a Series for Series input and an ndarray for array
          # input; np.asarray handles both, unlike ``.values``.
          return np.asarray(codes).reshape(-1, 1)
  28.  
  29.  
  # Demo script: bin four score columns, train a classifier on the bin indices
  # of random data, then predict on the same frame.

  model = RandomForestClassifier()  # Example estimator; put your own model here.

  # Bin edges are passed to CustomBinTransformer as plain lists.
  score_edges = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
  probability_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

  # (column label, bin edges) for every feature fed to the model.
  feature_bins = [
      ('НБКИ+', score_edges),
      ('ОКБ+', score_edges),
      ('Эквифакс+', score_edges),
      ('mail.ru+', probability_edges),
  ]

  # One "select the column, then bin it" sub-pipeline per feature; the
  # FeatureUnion stacks their outputs side by side.
  features = FeatureUnion([
      (name, Pipeline([
          ('selector', FeatureSelector(name)),
          # Each transformer gets its own copy of the edge list.
          ('bining', CustomBinTransformer(list(edges))),
      ]))
      for name, edges in feature_bins
  ])

  pipeline = Pipeline([
      ('features', features),
      ('estimator', model),
  ])

  # Synthetic data: 10 rows of uniform [0, 1) values in five columns.
  df = pd.DataFrame(
      np.random.rand(10, 5),
      columns=['НБКИ+', 'ОКБ+', 'Эквифакс+', 'mail.ru+', 'target'],
  )
  # Scale the first three columns to the 0..1000 range of their bin edges.
  df.iloc[:, :3] = df.iloc[:, :3] * 1000
  # Binary target. The builtin ``int`` replaces ``np.int``, which was removed
  # in NumPy 1.24.
  df['target'] = (df['target'] < 0.5).astype(int)
  pipeline.fit(df, df['target'])
  pipeline.predict(df)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement