Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn.metrics import average_precision_score, roc_auc_score
- from sklearn.model_selection import StratifiedKFold
def cat_cv(alg, X_train, y_train, cat_feat_pos, n=3):
    """Stratified K-fold cross-validation for a CatBoost-style classifier.

    Fits `alg` on each training fold with the held-out fold as eval_set,
    scores it with average precision on the held-out fold, and returns

        (mean over folds of 1 - average_precision, mean best iteration)

    Parameters
    ----------
    alg : estimator with fit / predict_proba / get_best_iteration
        Expected to be a CatBoostClassifier (uses `cat_features=` in fit).
    X_train, y_train : pandas objects (indexed with .iloc)
    cat_feat_pos : positions of categorical feature columns
    n : number of stratified folds
    """
    skf = StratifiedKFold(n_splits=n, random_state=42, shuffle=True)
    errors = []
    best_iters = []
    for train_index, test_index in skf.split(X_train, y_train):
        train, test = X_train.iloc[train_index], X_train.iloc[test_index]
        target_train, target_test = y_train.iloc[train_index], y_train.iloc[test_index]
        # NOTE(review): eval_set is the same fold that is scored below, so
        # early stopping "sees" the evaluation fold — common for HPO, but the
        # resulting scores are optimistic.
        alg.fit(train, target_train, eval_set=(test, target_test),
                cat_features=cat_feat_pos)
        # predict_proba(...).T[1] is the positive-class probability column.
        errors.append(1 - average_precision_score(target_test,
                                                  alg.predict_proba(test).T[1]))
        best_iters.append(alg.get_best_iteration())
    # BUGFIX: the original returned only the LAST fold's best iteration while
    # averaging the error over all folds; average both consistently.
    return float(np.mean(errors)), int(np.mean(best_iters))
def objective(params, early_stopping_rounds=50):
    """Hyperopt objective: CatBoost CV loss for one hyperparameter sample.

    Mutates the module-level ITERATION counter, trains a CatBoostClassifier
    via cat_cv, appends the trial to `out_file`, and returns the dict
    hyperopt expects ({'loss': ..., 'status': STATUS_OK, ...}).

    Parameters
    ----------
    params : dict sampled from `space` by hyperopt
    early_stopping_rounds : kept for interface compatibility; currently
        unused (CatBoost's od_type/od_pval in `space` control stopping).
    """
    global ITERATION
    ITERATION += 1

    # hp.quniform returns floats; CatBoost requires an integer depth.
    for parameter_name in ['max_depth']:
        params[parameter_name] = int(params[parameter_name])

    start = timer()
    cat = CatBoostClassifier(iterations=5000, verbose=0,
                             has_time=True, use_best_model=True, **params)
    cv_results, n_estimators = cat_cv(cat, X_train, y_train, categorical_features_pos)
    run_time = timer() - start

    # cv_results is already 1 - average precision, i.e. the quantity to minimize.
    loss = cv_results

    # Append this trial to the CSV log; `with` guarantees the handle is
    # closed even if the write fails (the original leaked an open handle
    # on every trial).
    with open(out_file, 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'iterations': n_estimators, 'train_time': run_time, 'status': STATUS_OK}
# Hyperopt search space: fixed CatBoost settings plus sampled hyperparameters.
# Per-class row counts, used below to build inverse-frequency class weights
# (each class is weighted by the OTHER class's count to counter imbalance).
_target_counts = df.groupby('target').count().balance_amt_rub
space = {
    'loss_function': 'Logloss',
    # Sampled: log-uniform learning rate in [0.005, 0.2].
    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
    # Sampled: integer tree depth 3..10 (cast to int inside the objective).
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'eval_metric': 'F1',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.5,
    # Sampled: L2 leaf regularization strength.
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0.0, 1.0),
    'random_seed': 42,
    # Overfitting detector: IncToDec with p-value threshold 0.01.
    'od_type': 'IncToDec',
    'od_pval': 0.01,
    'max_ctr_complexity': 1,
    # Sampled: randomness added to scoring of splits.
    'random_strength': hp.uniform('random_strength', 0.0, 1),
    'leaf_estimation_method': 'Newton',
    'class_weights': [_target_counts[1], _target_counts[0]],
}
def column_index(df, query_cols):
    """Return the positional indices of *query_cols* within df.columns.

    Parameters
    ----------
    df : pandas.DataFrame
    query_cols : sequence of column labels present in df

    Returns
    -------
    numpy.ndarray of integer column positions, in query_cols order.
    """
    all_cols = df.columns.values
    # searchsorted needs a sorted array; `order` maps sorted positions
    # back to the original column positions.
    order = np.argsort(all_cols)
    sorted_positions = np.searchsorted(all_cols, query_cols, sorter=order)
    return order[sorted_positions]
# Positions of the categorical feature columns (CatBoost takes indices).
categorical_features_pos = column_index(X_train, cat_features)

# CSV log of every trial; 'w' truncates any previous run's results.
out_file = 'cat_new_feat_ap.csv'
# `with` replaces the manual open/close pair so the handle is closed even
# if the header write raises.
with open(out_file, 'w') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])

# Trial counter mutated by objective() via `global`.
ITERATION = 0
bayes_trials = Trials()

# Run 30 evaluations of TPE Bayesian optimization.
# NOTE(review): newer hyperopt releases expect rstate=np.random.default_rng(42);
# np.random.RandomState only works on older versions — confirm the installed one.
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=30, trials=bayes_trials, rstate=np.random.RandomState(42))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement