Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Find the best k numeric features with a wrapper method (sequential backward selection).
def plot_hists(dataset, nrows, ncols, title="", figsize=(12, 16), bins=200):
    """Draw one histogram per column of ``dataset`` on an ``nrows`` x ``ncols`` grid.

    Args:
        dataset: DataFrame-like object; each column is drawn with
            ``dataset[column].plot.hist``.
        nrows: Number of subplot rows in the grid.
        ncols: Number of subplot columns in the grid.
        title: Figure-level title, passed to ``plt.suptitle``.
        figsize: Figure size handed to ``plt.figure``.
        bins: Number of bins used for every histogram.

    Raises:
        ValueError: If ``dataset`` has more columns than the grid has cells.
    """
    column_names = list(dataset.columns)

    # Fail before any figure is created instead of part-way through drawing.
    if len(column_names) > nrows * ncols:
        raise ValueError(
            "{} columns do not fit in a {}x{} subplot grid".format(
                len(column_names), nrows, ncols
            )
        )

    plt.figure(figsize=figsize)
    # Subplot positions are 1-based, hence start=1.
    for position, colname in enumerate(column_names, start=1):
        plt.subplot(nrows, ncols, position)
        plt.title(colname)
        dataset[colname].plot.hist(bins=bins)

    plt.suptitle(title)
    plt.tight_layout()
    # Presumably reserves room at the top so the suptitle is not covered.
    plt.subplots_adjust(top=0.95)
    plt.show()
def transform_data(dataset):
    """Return a Yeo-Johnson power-transformed, standardized version of ``dataset``.

    A new ``PowerTransformer`` is fitted on ``dataset`` itself and applied to
    it in one step. The fitted transformer is local to this call and is not
    returned. The result is a new DataFrame that reuses the input's column
    names.
    """
    transformer = PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
    return pd.DataFrame(
        data=transformer.fit_transform(dataset),
        columns=list(dataset.columns),
    )
# Candidate numeric columns fed to the feature selector, in the order the
# selector's feature indices refer to.
all_numeric_features = [
    # Readability-style text metrics (judging by the column names).
    'flesch_reading_ease',
    'smog_index',
    'flesch_kincaid_grade',
    'coleman_liau_index',
    'automated_readability_index',
    'dale_chall_readability_score',
    'difficult_words',
    'linsear_write_formula',
    'gunning_fog',
    # Review / reviewer level counts and ratings (judging by the column names).
    'commentCount',
    'userHelpfulness',
    'body_length',
    'body_num_words',
    'num_spelling_mistakes',
    'total_helpful_votes_for_user',
    'total_votes_for_user',
    'reviewer_avg_rating',
    # Sentiment-style aggregates (judging by the column names).
    'subjectivity_avg',
    'subjectivity_std',
    'polarity_avg',
    'polarity_std',
]
# Training split: keep only the rows dated strictly before the cut-off.
train_set = data[data['review_date'] < train_max_date]

# Cast the candidate features to float32 ('f') and drop the original index so
# features and target are both positionally indexed from 0.
numeric_X_train = train_set[all_numeric_features].astype('f').reset_index(drop=True)
numeric_Y_train = train_set['helpful'].reset_index(drop=True)

# Transform data.
#plot_hists(numeric_X_train, 7, 3, "Original Distributions")
numeric_X_train = transform_data(numeric_X_train)
#plot_hists(numeric_X_train, 7, 3, "Modified Distributions")

# forward=False makes this a *backward* sequential selection: it starts from
# all features and removes them until k_features=10 remain, scoring each
# candidate subset by 3-fold cross-validated F1.
# NOTE(review): the random forest has no random_state, so repeated runs may
# select different subsets -- confirm whether reproducibility is needed.
sfs = SequentialFeatureSelector(
    RandomForestClassifier(n_estimators=20, max_depth=4),
    k_features=10,
    forward=False,
    verbose=2,
    scoring='f1',
    cv=3,
)
sfs = sfs.fit(numeric_X_train.values, numeric_Y_train)

# The selector was fitted on a bare array, so map the selected column
# positions back to feature names.
best_k_numeric = [all_numeric_features[feature_index] for feature_index in sfs.k_feature_idx_]

# Label fixed: the selector runs backward (forward=False), not forward.
print('\nSequential Backward Selection Result:')
print(best_k_numeric)
print('CV Score:', sfs.k_score_)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement