import pandas as pd
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


def get_trained_random_forest(train_df, train_target, test_df, test_target, n_iter=20):
    """
    :param train_df: pandas.DataFrame.
        Table containing training data (explanatory variables).
    :param train_target: pandas.Series or a list.
        Dependent variable containing the label of every data point in train_df.
    :param test_df: pandas.DataFrame.
        Table containing test data (explanatory variables).
    :param test_target: pandas.Series or a list.
        Dependent variable containing the label of every data point in test_df.
    :param n_iter: int.
        Number of Random Forest configurations (parameter settings) to sample.
    """

    # Utility function to report the best scores found by the search
    def report(results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    # build a classifier for searching optimal parameters
    clf = RandomForestClassifier(n_jobs=-1)

    # specify parameters and distributions to sample from
    param_dist = {"n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                  # "max_depth": [3, 5, 8, 10, None],
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run randomized search over n_iter sampled parameter settings
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter)

    start = time()
    random_search.fit(train_df, train_target)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings.\n" % ((time() - start), n_iter))
    report(random_search.cv_results_)

    # build a real classifier for predictions, using the best-ranked parameters
    results = random_search.cv_results_
    best_config = np.flatnonzero(results['rank_test_score'] == 1)[0]
    model_params = results['params'][best_config]
    model_params['n_jobs'] = -1

    model = RandomForestClassifier(**model_params)
    model.fit(train_df, train_target)
    print('\nTest score: ', model.score(test_df, test_target))

    return model
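
A minimal usage sketch (not part of the original paste), assuming the imports and function defined above; the file name 'data.csv', the 'label' column, and the 80/20 split are placeholder assumptions:

# Hypothetical usage sketch: the CSV path, label column name, and split ratio are assumptions.
df = pd.read_csv('data.csv')
target = df.pop('label')

# simple 80/20 train/test split by row position
split = int(len(df) * 0.8)
train_df, test_df = df.iloc[:split], df.iloc[split:]
train_target, test_target = target.iloc[:split], target.iloc[split:]

model = get_trained_random_forest(train_df, train_target, test_df, test_target, n_iter=20)
predictions = model.predict(test_df)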