Advertisement
Guest User

Untitled

a guest
Sep 22nd, 2019
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.76 KB | None | 0 0
  1. from sklearn.metrics import average_precision_score, roc_auc_score
  2. from sklearn.model_selection import StratifiedKFold
  3. def cat_cv(alg, X_train, y_train,cat_feat_pos , n=3):
  4.     skf = StratifiedKFold(n_splits=n, random_state=42, shuffle=True )
  5.     errors = []
  6.     for train_index, test_index in skf.split(X_train, y_train):
  7.         train, test = X_train.iloc[train_index], X_train.iloc[test_index]
  8.         target_train, target_test = y_train.iloc[train_index], y_train.iloc[test_index]
  9.         alg.fit(train, target_train, eval_set=(test, target_test),cat_features=cat_feat_pos)
  10.         error = 1 - average_precision_score(target_test, alg.predict_proba(test).T[1])
  11. #         error = alg.get_best_score()['learn']['Logloss']
  12.         n_trees = alg.get_best_iteration()
  13.         errors.append(error)
  14.     return np.mean(np.array(errors)), n_trees
  15.  
  16.  
  17.  
  18. global ITERATION
  19. def objective(params, early_stopping_rounds=50):
  20.     """Objective function for Gradient Boosting Machine Hyperparameter Optimization"""
  21.  
  22.     # Keep track of evals
  23.     global ITERATION
  24.     ITERATION += 1
  25.  
  26.     # Make sure parameters that need to be integers are integers
  27.     for parameter_name in ['max_depth']:
  28.         params[parameter_name] = int(params[parameter_name])
  29.  
  30.     start = timer()
  31.     # Perform n_folds cross validation
  32.     cat = CatBoostClassifier(iterations=5000, verbose=0,
  33.                         has_time=True, use_best_model=True, **params)
  34.  
  35.     cv_results, n_estimators = cat_cv(cat, X_train, y_train, categorical_features_pos)
  36.     run_time = timer() - start
  37.     # Extract the best score
  38.     best_score = 1 - cv_results
  39.  
  40.     # Loss must be minimized
  41.     loss = cv_results
  42.  
  43.     # Boosting rounds that returned the highest cv score
  44.  
  45.  
  46.     # Write to the csv file ('a' means append)
  47.     of_connection = open(out_file, 'a')
  48.     writer = csv.writer(of_connection)
  49.     writer.writerow([loss, params, ITERATION, n_estimators, run_time])
  50.  
  51.     # Dictionary with information for evaluation
  52.     return {'loss': loss, 'params': params, 'iteration': ITERATION,
  53.             'iterations': n_estimators, 'train_time': run_time, 'status': STATUS_OK}
  54.  
  55.  
  56.  
  57. space = {'loss_function':'Logloss',
  58.                            'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
  59.                            'max_depth':hp.quniform('max_depth', 3, 10, 1),
  60.                            'eval_metric':'F1',
  61.                            'bootstrap_type':'Bernoulli',
  62.                            'subsample':0.5,
  63.                            'l2_leaf_reg':hp.uniform('l2_leaf_reg', 0.0, 1.0),
  64.                            'random_seed':42,
  65.                             'od_type':'IncToDec',
  66.                            'od_pval':0.01,
  67.                            'max_ctr_complexity':1,
  68.                            'random_strength':hp.uniform('random_strength', 0.0, 1),
  69.                            'leaf_estimation_method':'Newton',
  70.                            'class_weights':[df.groupby('target').count().balance_amt_rub[1], df.groupby('target').count().balance_amt_rub[0]]
  71.                            }
  72.  
  73.  
  74. def column_index(df, query_cols):
  75.     cols = df.columns.values
  76.     sidx = np.argsort(cols)
  77.     return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]
  78. categorical_features_pos = column_index(X_train,cat_features)
  79.  
  80.  
  81.  
  82. # File to save first results
  83. out_file = 'cat_new_feat_ap.csv'
  84. of_connection = open(out_file, 'w')
  85. writer = csv.writer(of_connection)
  86.  
  87. # Write the headers to the file
  88. writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])
  89. of_connection.close()
  90.  
  91.  
  92. ITERATION = 0
  93. bayes_trials = Trials()
  94. # Run optimization
  95. best = fmin(fn = objective, space = space, algo = tpe.suggest,
  96.             max_evals = 30, trials = bayes_trials, rstate =np.random.RandomState(42))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement