Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from sklearn.ensemble import RandomForestClassifier
- from time import time
- from scipy.stats import randint as sp_randint
- from sklearn.model_selection import RandomizedSearchCV
- import numpy as np
def get_trained_random_forest(train_df, train_target, test_df, test_target, n_iter=20):
    """
    Tune a Random Forest with a randomized hyper-parameter search, then
    train a classifier with the best configuration found and return it.

    The search is fitted on the training data only. The three best-ranked
    configurations, the duration of the search and the score of the final
    model on the test data are printed to stdout.

    :param: train_df : pandas.DataFrame.
        Table containing training data (explanatory variables).
    :param: train_target : pandas.Series or a list.
        Dependent variable that contains labels of every data point in train_df.
    :param: test_df : pandas.DataFrame.
        Table containing data for testing (explanatory variables).
    :param: test_target : pandas.Series or a list.
        Dependent variable that contains labels of every data point in test_df.
    :param: n_iter : int.
        Number of configurations (parameter settings) of a Random Forest that are sampled.
    :return: RandomForestClassifier.
        Classifier built from the best-ranked configuration and fitted on
        train_df / train_target.
    """

    # Utility function to report best scores
    def report(results, n_top=3):
        """Print score statistics and parameters of the n_top best-ranked candidates."""
        for i in range(1, n_top + 1):
            # Several candidates may share the same rank, so print all of them.
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    # build a classifier for searching optimal parameters
    clf = RandomForestClassifier(n_jobs=-1)

    # specify parameters and distributions to sample from
    param_dist = {"n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                  # "max_depth": [3,5,8,10,None],
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run randomized search
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter)
    start = time()
    random_search.fit(train_df, train_target)
    elapsed = time() - start
    # Report the number of sampled candidates via the ``n_iter`` argument
    # (the previous code referenced an undefined name here and crashed).
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings.\n" % (elapsed, n_iter))
    report(random_search.cv_results_)

    # build a real classifier for predictions
    # Copy the winning parameters so that adding ``n_jobs`` does not modify
    # the dictionaries stored inside the search results.
    model_params = dict(random_search.best_params_)
    model_params['n_jobs'] = -1
    model = RandomForestClassifier(**model_params)
    model.fit(train_df, train_target)
    print('\nTest score: ', model.score(test_df, test_target))
    return model
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement