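# Income-prediction pipeline: clean the competition CSVs, impute and
# target-encode features, tune an XGBoost regressor with Bayesian
# optimisation, and write predictions to a submission file.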
import pandas as pd
import numpy as np
import category_encoders as ce
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from bayes_opt import BayesianOptimization
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
def preprocess(frame):
    frame = drop_duplicates(frame)
    frame = drop_unwanted_columns(frame)
    frame = treat_NaN_values(frame)
    return frame

def drop_duplicates(frame):
    # keep only the first row for each Instance ID
    frame.sort_values('Instance', inplace=True)
    frame.drop_duplicates('Instance', keep='first', inplace=True)
    return frame

def drop_unwanted_columns(frame):
    # drop the ID column and the columns the model does not use
    frame = frame.drop('Instance', axis=1)
    frame = frame.drop(columns=['Housing Situation', 'Gender',
                                'Yearly Income in addition to Salary (e.g. Rental Income)',
                                'Wears Glasses', 'Hair Color'])
    return frame

def treat_NaN_values(frame):
    # '#NUM!' is a spreadsheet error marker; treat it as missing, then forward-fill
    frame = frame.replace('#NUM!', np.nan)
    frame = frame.ffill()
    return frame
train_df = preprocess(train_df)
test_df = preprocess(test_df)
# extract the target before the column transformer drops it
income = train_df['Total Yearly Income [EUR]']
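# Median-impute the numeric columns (selected by position in the raw frame)
# and mode-impute the categorical ones. ColumnTransformer emits the numeric
# block first, then the categorical block, so indices used downstream refer
# to that new column order.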
print('beginning impute')
ct = ColumnTransformer(transformers=[
    ('numerical', SimpleImputer(strategy='median'), [0, 1, 2, 4, 6, 9]),
    ('categorical', SimpleImputer(strategy='most_frequent'), [3, 5, 7, 8])])
ct.fit(train_df, income)
print('transforming')
train_df = ct.transform(train_df)
test_df = ct.transform(test_df)
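# Columns 6-9 of the transformed array are the imputed categorical
# columns; target-encode them against the income label.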
print('encoding')
enc = ce.TargetEncoder(cols=[6, 7, 8, 9]).fit(train_df, income)
train_df = enc.transform(train_df)

print('building dmatrix')
train_matrix = xgb.DMatrix(train_df, label=income)
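# Objective for BayesianOptimization: cross-validate XGBoost with the
# proposed hyper-parameters and report the (negated) mean test MAE.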
def bayesian_target(gamma, max_depth, colsample_bytree, subsample, learning_rate):
    cv_params = {'max_depth': int(max_depth),
                 'eval_metric': 'mae',
                 'subsample': subsample,
                 'gamma': gamma,
                 'colsample_bytree': colsample_bytree,
                 'nthread': -1,
                 'learning_rate': learning_rate
                 }
    # cross-validate over 5 folds
    cv = xgb.cv(cv_params, train_matrix, num_boost_round=1000, nfold=5)
    # BayesianOptimization maximises its objective, so negate the MAE
    # to turn "maximise the score" into "minimise the error"
    return -1 * cv['test-mae-mean'].iloc[-1]
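# Tune the hyper-parameters within the bounds below: explore 10 random
# points, then take 5 Bayesian steps guided by expected improvement.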
print('building bayesian optimizer')
param_bounds = {'gamma': (0, 1),
                'max_depth': (3, 5),
                'colsample_bytree': (0.3, 0.9),
                'subsample': (0.5, 0.9),
                'learning_rate': (0.01, 0.02)}
xgb_bo = BayesianOptimization(bayesian_target, param_bounds)
# acq/xi select the expected-improvement acquisition function
# (accepted by bayes_opt 1.x; newer releases configure this differently)
xgb_bo.maximize(init_points=10, n_iter=5, acq='ei', xi=0.0)
best_params = xgb_bo.max['params']
# XGBoost expects an int for max_depth, so cast the float the optimiser returns
best_params['max_depth'] = int(best_params['max_depth'])
print('training model')
model = xgb.train(best_params, train_matrix, num_boost_round=250)
# target-encode the test columns and build a DMatrix for prediction
test_df = enc.transform(test_df)
test_df = xgb.DMatrix(test_df)
predictions = model.predict(test_df)
print('writing to file')
# Instance IDs are saved to a separate file for ease of access
instances = pd.read_csv('data/instances.csv')['Instance'].values
# 'with' ensures the submission file is flushed and closed
with open('data/submission.csv', 'w') as f:
    f.write('Instance,Total Yearly Income [EUR]\n')
    for i in range(len(predictions)):
        f.write(str(instances[i]) + ',' + str(predictions[i]) + '\n')
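# Note: rows are written in the order of instances.csv; this assumes its
# Instance order matches the row order of data/test.csv after preprocessing.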