Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 65,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# code is written for readability, not performance\n",
- "def sort_print_importances(importances):\n",
- "    \"\"\"Build a one-column DataFrame of importances, sorted descending.\n",
- "\n",
- "    importances: mapping of feature name -> importance value.\n",
- "    Returns a DataFrame indexed by feature name with a single\n",
- "    \"Importance\" column; an empty mapping yields an empty frame.\n",
- "    \"\"\"\n",
- "    # Naming the column at construction time keeps the empty-mapping case\n",
- "    # from failing on a column-count mismatch.\n",
- "    imps_df = pd.DataFrame(\n",
- "        {\"Importance\": list(importances.values())},\n",
- "        index=list(importances.keys()),\n",
- "    )\n",
- "    return imps_df.sort_values(\"Importance\", ascending=False)\n",
- " \n",
- "def get_auc(true_labels, predictions, positive_label):\n",
- "    \"\"\"Area under the ROC curve of the predictions against the true labels.\n",
- "\n",
- "    positive_label is forwarded to roc_curve as pos_label.\n",
- "    \"\"\"\n",
- "    roc_points = sklearn.metrics.roc_curve(\n",
- "        true_labels, predictions, pos_label=positive_label\n",
- "    )\n",
- "    # roc_curve returns (fpr, tpr, thresholds); the thresholds are not needed.\n",
- "    false_pos_rate, true_pos_rate = roc_points[0], roc_points[1]\n",
- "    return sklearn.metrics.auc(false_pos_rate, true_pos_rate)\n",
- " \n",
- "def reduce_importances(dummy_importances, original_columns):\n",
- "    \"\"\"Sum 1-hot encoded (dummy) importances back onto their source columns.\n",
- "\n",
- "    dummy_importances: dict mapping encoded column name -> importance.\n",
- "    original_columns: column names before encoding (strings).\n",
- "    Returns a dict with one entry per original column (0 when nothing matched).\n",
- "\n",
- "    A dummy column belongs to an original column when the names are equal or\n",
- "    the dummy name starts with the original name plus \"_\". When several\n",
- "    original columns match, only the longest name is credited, so a value is\n",
- "    never counted twice.\n",
- "    \"\"\"\n",
- "    importances = {col: 0 for col in original_columns}\n",
- "    # Still a columns x dummies double loop; fine for narrow data.\n",
- "    for dummy_col, val in dummy_importances.items():\n",
- "        dummy_name = str(dummy_col)\n",
- "        owners = [\n",
- "            col for col in importances\n",
- "            if dummy_col == col or dummy_name.startswith(col + \"_\")\n",
- "        ]\n",
- "        if owners:\n",
- "            # e.g. \"edu_lvl_1\" matches both \"edu\" and \"edu_lvl\"; credit \"edu_lvl\".\n",
- "            importances[max(owners, key=len)] += val\n",
- "    return importances\n",
- "\n",
- "# bars are ordered by the first importances dict passed in\n",
- "def plot_importances(importances, names):\n",
- "    \"\"\"Draw one bar trace per importances dict in a plotly FigureWidget.\n",
- "\n",
- "    importances: non-empty list of dicts mapping column -> importance.\n",
- "    names: legend label for each dict, same length as importances.\n",
- "    The x-axis order is the columns of importances[0], highest value\n",
- "    first; a column missing from a later dict is plotted as 0.\n",
- "    Returns the FigureWidget.\n",
- "    \"\"\"\n",
- "    assert len(importances) >= 1, \"need at least one importances dict\"\n",
- "    assert len(importances) == len(names), \"one name per importances dict\"\n",
- "    import plotly.graph_objs as go\n",
- "    cols = sorted(importances[0], key=importances[0].get, reverse=True)\n",
- "    fig = go.FigureWidget()\n",
- "    fig.layout.title = \"Importances\"\n",
- "    fig.layout.showlegend = True\n",
- "    for imps, name in zip(importances, names):\n",
- "        fig.add_bar(y=[imps.get(col, 0) for col in cols], x=cols, name=name)\n",
- "    return fig"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Train df size is 21815, test df size is 10746\n"
- ]
- }
- ],
- "source": [
- "target_column = \"yearly-income\"\n",
- "positive_label = \" >50K\"\n",
- "\n",
- "# One-hot encode the object-dtype features (plus education-num) so a\n",
- "# Gradient Boosted Tree can be trained on them\n",
- "categorical_cols = [\n",
- "    col\n",
- "    for col in df.columns\n",
- "    if col != target_column and df.dtypes[col] == object\n",
- "] + [\"education-num\"]\n",
- "dummy_df = pd.get_dummies(df, columns=categorical_cols)\n",
- "\n",
- "# Hold out 33% of the rows for testing\n",
- "train_df, test_df = train_test_split(dummy_df, test_size=0.33, random_state=42)\n",
- "print(\"Train df size is {}, test df size is {}\".format(len(train_df), len(test_df)))\n",
- "\n",
- "# Feature names in training order: every encoded column except the target\n",
- "columns = list(train_df.columns)\n",
- "columns.remove(target_column)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "# Basic grid search over learning rate and tree depth; every candidate\n",
- "# is trained with early stopping\n",
- "gbm = sklearn.model_selection.GridSearchCV(\n",
- "    estimator=ensemble.GradientBoostingClassifier(\n",
- "        n_estimators=500,\n",
- "        n_iter_no_change=20,\n",
- "        validation_fraction=0.2,\n",
- "    ),\n",
- "    param_grid={\n",
- "        \"learning_rate\": np.arange(0.05, 0.35, 0.05),\n",
- "        \"max_depth\": [1, 2, 3, 4, 5],\n",
- "    },\n",
- "    cv=5,\n",
- ")\n",
- "gbm.fit(train_df.drop([target_column], axis=1), train_df[target_column])\n",
- "tuned_gbm = gbm"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Best parameters found {'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.05, 'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_iter_no_change': 20, 'presort': 'auto', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.2, 'verbose': 0, 'warm_start': False}\n",
- "\n",
- "Stopped improving after: 500\n",
- "\n",
- "AUC on test is: 92.55 %\n"
- ]
- }
- ],
- "source": [
- "# Use one handle for the tuned estimator instead of mixing gbm / tuned_gbm\n",
- "model = tuned_gbm.best_estimator_\n",
- "print(\"Best parameters found\", model.get_params())\n",
- "print(\"\\nStopped improving after:\", model.n_estimators_)\n",
- "\n",
- "# predict and score on test dataset: keep only the probability column\n",
- "# that belongs to the positive class\n",
- "test_features = test_df.drop(target_column, axis=1)\n",
- "positive_idx = list(model.classes_).index(positive_label)\n",
- "predictions = model.predict_proba(test_features)[:, positive_idx]\n",
- "score = get_auc(test_df[target_column], predictions, positive_label)\n",
- "print(\"\\nAUC on test is: \", round(score * 100, 2), \"%\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>Importance</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>marital-status_ Married-civ-spouse</th>\n",
- " <td>0.362117</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>capital-gain</th>\n",
- " <td>0.213057</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>capital-loss</th>\n",
- " <td>0.072038</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>age</th>\n",
- " <td>0.063056</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>hours-per-week</th>\n",
- " <td>0.041892</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation_ Prof-specialty</th>\n",
- " <td>0.034868</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation_ Exec-managerial</th>\n",
- " <td>0.031057</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>fnlwgt</th>\n",
- " <td>0.024566</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education-num_13</th>\n",
- " <td>0.017286</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education-num_14</th>\n",
- " <td>0.012526</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education_ Bachelors</th>\n",
- " <td>0.008913</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>workclass_ Self-emp-not-inc</th>\n",
- " <td>0.008253</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education-num_16</th>\n",
- " <td>0.006599</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation_ Tech-support</th>\n",
- " <td>0.006491</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education_ Prof-school</th>\n",
- " <td>0.005582</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education-num_9</th>\n",
- " <td>0.004995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>relationship_ Wife</th>\n",
- " <td>0.004767</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation_ Other-service</th>\n",
- " <td>0.004670</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation_ Farming-fishing</th>\n",
- " <td>0.004615</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation_ Sales</th>\n",
- " <td>0.004305</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " Importance\n",
- "marital-status_ Married-civ-spouse 0.362117\n",
- "capital-gain 0.213057\n",
- "capital-loss 0.072038\n",
- "age 0.063056\n",
- "hours-per-week 0.041892\n",
- "occupation_ Prof-specialty 0.034868\n",
- "occupation_ Exec-managerial 0.031057\n",
- "fnlwgt 0.024566\n",
- "education-num_13 0.017286\n",
- "education-num_14 0.012526\n",
- "education_ Bachelors 0.008913\n",
- "workclass_ Self-emp-not-inc 0.008253\n",
- "education-num_16 0.006599\n",
- "occupation_ Tech-support 0.006491\n",
- "education_ Prof-school 0.005582\n",
- "education-num_9 0.004995\n",
- "relationship_ Wife 0.004767\n",
- "occupation_ Other-service 0.004670\n",
- "occupation_ Farming-fishing 0.004615\n",
- "occupation_ Sales 0.004305"
- ]
- },
- "execution_count": 67,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Show the 20 largest raw (per encoded column) importances\n",
- "# Read feature_importances_ once rather than on every loop iteration\n",
- "feature_importances = tuned_gbm.best_estimator_.feature_importances_\n",
- "# zip() would silently truncate on a length mismatch, so check first\n",
- "assert len(columns) == len(feature_importances)\n",
- "dummy_importances_1 = dict(zip(columns, feature_importances))\n",
- "sort_print_importances(dummy_importances_1).head(20)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "plot_importances(\n",
- "    importances=[dummy_importances_1],\n",
- "    names=[\"Scikit Importances (Dummy)\"],\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>Importance</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>marital-status</th>\n",
- " <td>0.364811</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>capital-gain</th>\n",
- " <td>0.213057</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>occupation</th>\n",
- " <td>0.094029</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>capital-loss</th>\n",
- " <td>0.072038</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>age</th>\n",
- " <td>0.063056</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education-num</th>\n",
- " <td>0.051340</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>hours-per-week</th>\n",
- " <td>0.041892</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>education</th>\n",
- " <td>0.032389</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>fnlwgt</th>\n",
- " <td>0.024566</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>workclass</th>\n",
- " <td>0.016797</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>native-country</th>\n",
- " <td>0.011466</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>relationship</th>\n",
- " <td>0.010971</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>race</th>\n",
- " <td>0.001832</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>sex</th>\n",
- " <td>0.001756</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " Importance\n",
- "marital-status 0.364811\n",
- "capital-gain 0.213057\n",
- "occupation 0.094029\n",
- "capital-loss 0.072038\n",
- "age 0.063056\n",
- "education-num 0.051340\n",
- "hours-per-week 0.041892\n",
- "education 0.032389\n",
- "fnlwgt 0.024566\n",
- "workclass 0.016797\n",
- "native-country 0.011466\n",
- "relationship 0.010971\n",
- "race 0.001832\n",
- "sex 0.001756"
- ]
- },
- "execution_count": 70,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Collapse the per-dummy importances back onto the original columns by summing\n",
- "assert len(tuned_gbm.best_estimator_.feature_importances_) == len(columns)\n",
- "\n",
- "# Original feature names: every raw column except the target\n",
- "original_columns = list(df.columns)\n",
- "original_columns.remove(target_column)\n",
- "\n",
- "reduced_importances_1 = reduce_importances(\n",
- "    dummy_importances_1, original_columns\n",
- ")\n",
- "sort_print_importances(reduced_importances_1)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "plot_importances(\n",
- "    importances=[reduced_importances_1],\n",
- "    names=[\"Reduced Importances\"],\n",
- ")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement