Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn.metrics import average_precision_score, roc_auc_score
- from sklearn.model_selection import StratifiedKFold
def cat_cv(alg, X_train, y_train, cat_feat_pos, n=3):
    """Stratified K-fold cross-validation for a CatBoost-style classifier.

    Fits `alg` on each training fold with the held-out fold as eval_set,
    scores it with average precision on the held-out fold, and returns

        (mean over folds of 1 - average_precision, mean best iteration)

    Parameters
    ----------
    alg : estimator with fit / predict_proba / get_best_iteration
        Expected to be a CatBoostClassifier (uses `cat_features=` in fit).
    X_train, y_train : pandas objects (indexed with .iloc)
    cat_feat_pos : positions of categorical feature columns
    n : number of stratified folds
    """
    skf = StratifiedKFold(n_splits=n, random_state=42, shuffle=True)
    errors = []
    best_iters = []
    for train_index, test_index in skf.split(X_train, y_train):
        train, test = X_train.iloc[train_index], X_train.iloc[test_index]
        target_train, target_test = y_train.iloc[train_index], y_train.iloc[test_index]
        # NOTE(review): eval_set is the same fold that is scored below, so
        # early stopping "sees" the evaluation fold — common for HPO, but the
        # resulting scores are optimistic.
        alg.fit(train, target_train, eval_set=(test, target_test),
                cat_features=cat_feat_pos)
        # predict_proba(...).T[1] is the positive-class probability column.
        errors.append(1 - average_precision_score(target_test,
                                                  alg.predict_proba(test).T[1]))
        best_iters.append(alg.get_best_iteration())
    # BUGFIX: the original returned only the LAST fold's best iteration while
    # averaging the error over all folds; average both consistently.
    return float(np.mean(errors)), int(np.mean(best_iters))
def objective(params, early_stopping_rounds=50):
    """Hyperopt objective: CatBoost CV loss for one hyperparameter sample.

    Mutates the module-level ITERATION counter, trains a CatBoostClassifier
    via cat_cv, appends the trial to `out_file`, and returns the dict
    hyperopt expects ({'loss': ..., 'status': STATUS_OK, ...}).

    Parameters
    ----------
    params : dict sampled from `space` by hyperopt
    early_stopping_rounds : kept for interface compatibility; currently
        unused (CatBoost's od_type/od_pval in `space` control stopping).
    """
    global ITERATION
    ITERATION += 1

    # hp.quniform returns floats; CatBoost requires an integer depth.
    for parameter_name in ['max_depth']:
        params[parameter_name] = int(params[parameter_name])

    start = timer()
    cat = CatBoostClassifier(iterations=5000, verbose=0,
                             has_time=True, use_best_model=True, **params)
    cv_results, n_estimators = cat_cv(cat, X_train, y_train, categorical_features_pos)
    run_time = timer() - start

    # cv_results is already 1 - average precision, i.e. the quantity to minimize.
    loss = cv_results

    # Append this trial to the CSV log; `with` guarantees the handle is
    # closed even if the write fails (the original leaked an open handle
    # on every trial).
    with open(out_file, 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'iterations': n_estimators, 'train_time': run_time, 'status': STATUS_OK}
# Hyperopt search space: fixed CatBoost settings plus sampled hyperparameters.
# Per-class row counts, used below to build inverse-frequency class weights
# (each class is weighted by the OTHER class's count to counter imbalance).
_target_counts = df.groupby('target').count().balance_amt_rub
space = {
    'loss_function': 'Logloss',
    # Sampled: log-uniform learning rate in [0.005, 0.2].
    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
    # Sampled: integer tree depth 3..10 (cast to int inside the objective).
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'eval_metric': 'F1',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.5,
    # Sampled: L2 leaf regularization strength.
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0.0, 1.0),
    'random_seed': 42,
    # Overfitting detector: IncToDec with p-value threshold 0.01.
    'od_type': 'IncToDec',
    'od_pval': 0.01,
    'max_ctr_complexity': 1,
    # Sampled: randomness added to scoring of splits.
    'random_strength': hp.uniform('random_strength', 0.0, 1),
    'leaf_estimation_method': 'Newton',
    'class_weights': [_target_counts[1], _target_counts[0]],
}
def column_index(df, query_cols):
    """Return the positional indices of *query_cols* within df.columns.

    Parameters
    ----------
    df : pandas.DataFrame
    query_cols : sequence of column labels present in df

    Returns
    -------
    numpy.ndarray of integer column positions, in query_cols order.
    """
    all_cols = df.columns.values
    # searchsorted needs a sorted array; `order` maps sorted positions
    # back to the original column positions.
    order = np.argsort(all_cols)
    sorted_positions = np.searchsorted(all_cols, query_cols, sorter=order)
    return order[sorted_positions]
# Positions of the categorical feature columns (CatBoost takes indices).
categorical_features_pos = column_index(X_train, cat_features)

# CSV log of every trial; 'w' truncates any previous run's results.
out_file = 'cat_new_feat_ap.csv'
# `with` replaces the manual open/close pair so the handle is closed even
# if the header write raises.
with open(out_file, 'w') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])

# Trial counter mutated by objective() via `global`.
ITERATION = 0
bayes_trials = Trials()

# Run 30 evaluations of TPE Bayesian optimization.
# NOTE(review): newer hyperopt releases expect rstate=np.random.default_rng(42);
# np.random.RandomState only works on older versions — confirm the installed one.
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=30, trials=bayes_trials, rstate=np.random.RandomState(42))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement