Advertisement
Guest User

Untitled

a guest
Feb 19th, 2019
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.05 KB | None | 0 0
  1. #Find best k numeric features (Wrapper Method).
  2.  
  3. def plot_hists(dataset, nrows, ncols, title=""):
  4. original_cols = list(dataset.columns)
  5. plt.figure(figsize=(12, 16))
  6. for i in range(len(original_cols)):
  7. colname = original_cols[i]
  8. plt.subplot(nrows, ncols, i+1)
  9. plt.title(colname)
  10. dataset[colname].plot.hist(bins=200)
  11.  
  12. plt.suptitle(title)
  13. plt.tight_layout()
  14. plt.subplots_adjust(top=0.95)
  15. plt.show()
  16.  
  17. def transform_data(dataset):
  18. original_cols = list(dataset.columns)
  19. power_trans = PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
  20. transf_data = power_trans.fit_transform(dataset)
  21. df = pd.DataFrame(data=transf_data, columns=original_cols)
  22. return df
  23.  
  24. all_numeric_features = [
  25. 'flesch_reading_ease',
  26. 'smog_index',
  27. 'flesch_kincaid_grade',
  28. 'coleman_liau_index',
  29. 'automated_readability_index',
  30. 'dale_chall_readability_score',
  31. 'difficult_words',
  32. 'linsear_write_formula',
  33. 'gunning_fog',
  34. 'commentCount',
  35. 'userHelpfulness',
  36. 'body_length',
  37. 'body_num_words',
  38. 'num_spelling_mistakes',
  39. 'total_helpful_votes_for_user',
  40. 'total_votes_for_user',
  41. 'reviewer_avg_rating',
  42. 'subjectivity_avg',
  43. 'subjectivity_std',
  44. 'polarity_avg',
  45. 'polarity_std',
  46. ]
  47.  
  48. train_set = data[data['review_date'] < train_max_date]
  49. numeric_X_train = train_set[all_numeric_features].astype('f').reset_index(drop=True)
  50. numeric_Y_train = train_set['helpful'].reset_index(drop=True)
  51.  
  52. # Transform data.
  53. #plot_hists(numeric_X_train, 7, 3, "Original Distributions")
  54. numeric_X_train = transform_data(numeric_X_train)
  55. #plot_hists(numeric_X_train, 7, 3, "Modified Distributions")
  56.  
  57. sfs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=20, max_depth=4),
  58. k_features=10, forward=False, verbose=2, scoring='f1', cv=3)
  59. sfs = sfs.fit(numeric_X_train.values, numeric_Y_train)
  60. best_k_numeric = [all_numeric_features[feature_index] for feature_index in sfs.k_feature_idx_]
  61.  
  62. print('\nSequential Forward Selection Result:')
  63. print(best_k_numeric)
  64. print('CV Score:', sfs.k_score_)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement