import pandas as pd
import numpy as np
import category_encoders as ce
import xgboost as xgb

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from bayes_opt import BayesianOptimization
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

def preprocess(frame):
    frame = drop_duplicates(frame)
    frame = drop_unwanted_columns(frame)
    frame = treat_NaN_values(frame)
    return frame

def drop_duplicates(frame):
    # keep only the first occurrence of each Instance id
    frame.sort_values('Instance', inplace=True)
    frame.drop_duplicates('Instance', keep='first', inplace=True)
    return frame

def drop_unwanted_columns(frame):
    # drop the row identifier and the columns treated as unwanted features
    frame = frame.drop('Instance', axis=1)
    frame = frame.drop(columns=['Housing Situation', 'Gender',
                                'Yearly Income in addition to Salary (e.g. Rental Income)',
                                'Wears Glasses', 'Hair Color'])
    return frame

def treat_NaN_values(frame):
    # '#NUM!' is a spreadsheet error marker; convert it to NaN, then forward-fill
    frame = frame.replace('#NUM!', np.nan)
    frame = frame.ffill()
    return frame

train_df = preprocess(train_df)
test_df = preprocess(test_df)

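# A small sanity-check sketch (report_remaining_nans is added, not in the
# original paste): forward-fill cannot fill NaNs that occur before the first
# valid value in a column, so any leftovers are reported here; the imputer
# below handles them.
def report_remaining_nans(frame, name):
    remaining = int(frame.isna().sum().sum())
    if remaining:
        print(name + ': ' + str(remaining) + ' NaN values remain after preprocessing')

report_remaining_nans(train_df, 'train')
report_remaining_nans(test_df, 'test')
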
income = train_df['Total Yearly Income [EUR]']

print('beginning impute')
ct = ColumnTransformer(transformers=[
    ('numerical', SimpleImputer(strategy='median'), [0, 1, 2, 4, 6, 9]),
    ('categorical', SimpleImputer(strategy='most_frequent'), [3, 5, 7, 8])])
ct.fit(train_df, income)

print('transforming')
train_df = ct.transform(train_df)
test_df = ct.transform(test_df)

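# Note (added): ColumnTransformer concatenates its transformers' outputs in the
# order they are listed, so columns 0-5 of the transformed array are the six
# numerical features and columns 6-9 are the four categorical ones; the
# TargetEncoder below relies on that ordering via cols=[6, 7, 8, 9].
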
print('encoding')
enc = ce.TargetEncoder(cols=[6, 7, 8, 9]).fit(train_df, income)
train_df = enc.transform(train_df)

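# Note (added): TargetEncoder replaces each category with a smoothed blend of
# the per-category target mean and the global target mean; it is fit on the
# training data only, and the same fitted encoder is reused on the test set
# just before prediction below.
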
print('building dmatrix')
train_matrix = xgb.DMatrix(train_df, label=income)

def bayesian_target(gamma, max_depth, colsample_bytree, subsample, learning_rate):
    # 'eta' is an alias of 'learning_rate', so only the tuned value is passed;
    # 'nthread' is the native-API spelling of the sklearn wrapper's n_jobs
    cv_params = {'max_depth': int(max_depth),
                 'eval_metric': 'mae',
                 'subsample': subsample,
                 'gamma': gamma,
                 'colsample_bytree': colsample_bytree,
                 'nthread': -1,
                 'learning_rate': learning_rate}

    # cross-validate the candidate parameters
    cv = xgb.cv(cv_params, train_matrix, num_boost_round=1000, nfold=5)
    # BayesianOptimization maximizes its objective, so negate the final
    # mean MAE to make it minimize the error
    return -1 * cv['test-mae-mean'].iloc[-1]

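# A quick smoke test (a sketch, not in the original paste): one call with
# mid-range values confirms the objective runs end to end before launching the
# full search. Uncomment to use; it runs a complete 5-fold cross-validation.
# print(bayesian_target(gamma=0.5, max_depth=4, colsample_bytree=0.6,
#                       subsample=0.7, learning_rate=0.015))
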
print('building bayesian optimizer')

param_bounds = {'gamma': (0, 1),
                'max_depth': (3, 5),
                'colsample_bytree': (0.3, 0.9),
                'subsample': (0.5, 0.9),
                'learning_rate': (0.01, 0.02)}
xgb_bo = BayesianOptimization(bayesian_target, param_bounds)

# acq='ei' (expected improvement) and xi are maximize() keyword arguments in
# the bayes_opt 1.x API; newer releases configure the acquisition function on
# the optimizer itself
xgb_bo.maximize(init_points=10, n_iter=5, acq='ei', xi=0.0)

best_params = xgb_bo.max['params']
# XGBoost expects an int for max_depth, so cast the float the search returns
best_params['max_depth'] = int(best_params['max_depth'])

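# Inspecting the search result (added): BayesianOptimization exposes the best
# score and parameter set via its .max attribute.
print('best CV score (negated MAE):', xgb_bo.max['target'])
print('best params:', best_params)
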
print('training model')
model = xgb.train(best_params, train_matrix, num_boost_round=250)

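# Optional persistence (a sketch; the file path is an assumption): save_model
# is a standard xgboost Booster method, so the trained model can be reloaded
# later without rerunning the search.
model.save_model('data/xgb_model.bin')
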
# encode the test features and wrap them in a DMatrix for prediction
test_df = enc.transform(test_df)
test_df = xgb.DMatrix(test_df)

predictions = model.predict(test_df)

print('writing to file')
# Instance ids are saved to a separate file for ease of access
instances = pd.read_csv('data/instances.csv')['Instance'].values

# write the submission header, then one row per prediction
with open('data/submission.csv', 'w') as f:
    f.write('Instance,Total Yearly Income [EUR]\n')
    for instance, prediction in zip(instances, predictions):
        f.write(str(instance) + ',' + str(prediction) + '\n')
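
# Equivalent pandas alternative (a sketch; uses the same header as above):
# pd.DataFrame({'Instance': instances,
#               'Total Yearly Income [EUR]': predictions}).to_csv(
#     'data/submission.csv', index=False)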