Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import time

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Iterative feature pruning by variance inflation factor (VIF).
#
# Each pass computes the VIF of every remaining feature, then the next pass
# drops the single feature with the largest VIF.  The loop stops as soon as
# the largest VIF is no longer above `vif_threshold`; the surviving feature
# names are left in `all_feat`.
#
# NOTE(review): `calc_df` and `lr_model_feats` are not defined in this
# snippet.  Presumably `calc_df` is a pandas DataFrame and `lr_model_feats`
# is a list of its column names -- confirm against the surrounding notebook.

vif_threshold = 10
all_feat = lr_model_feats

# Start above any threshold so the loop body runs at least once.  The builtin
# float infinity is used because the `np.Inf` alias was removed in NumPy 2.0.
max_vif_value = float("inf")
max_vif_feature = None
iter_num = 0

while max_vif_value > vif_threshold:
    iter_num += 1
    t_start = time.perf_counter()
    print('-' * 100)
    print(max_vif_value, max_vif_feature, len(all_feat))

    # Drop the worst feature found by the previous pass (nothing to drop on
    # the first pass, where no VIF has been computed yet).
    if max_vif_feature is not None:
        all_feat = [t for t in all_feat if t != max_vif_feature]

    vif = pd.DataFrame()
    X = calc_df[all_feat]

    # Fill missing values and convert to an array once per pass instead of
    # once per column; the matrix is identical for every column index.
    exog = X.fillna(0).values
    vif["vif_value"] = [
        variance_inflation_factor(exog, i) for i in range(exog.shape[1])
    ]
    vif['feature_name'] = X.columns

    max_vif_value = vif.vif_value.max()
    max_vif_feature = (
        vif.sort_values("vif_value", ascending=False)
        .head(1)
        .feature_name.values[0]
    )

    t_cost = time.perf_counter() - t_start
    print(f"---- iter {iter_num}: cost {t_cost}")

print("done!")
print(max_vif_value, max_vif_feature)
print(len(all_feat))
print(all_feat)
Add Comment
Please, Sign In to add comment