Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# code is written for readibility not performance\n",
    "def sort_print_importances(importances):\n",
    "    imps_df = pd.DataFrame(list(importances.values()), list(importances.keys()))\n",
    "    imps_df.columns = [\"Importance\"]\n",
    "    return imps_df.sort_values(\"Importance\", ascending = False)\n",
    "    \n",
    "def get_auc(true_labels, predictions, positive_label):\n",
    "    fpr, tpr, thresholds = sklearn.metrics.roc_curve(true_labels, predictions, pos_label = positive_label)\n",
    "    return sklearn.metrics.auc(fpr, tpr)\n",
    "    \n",
    "def reduce_importances(dummy_importances, original_columns):\n",
    "    \"\"\" Helper function to combine the importances obtained from 1-hot encoded data.\"\"\"\n",
    "    \n",
    "    # d2 loop optimize for wider data\n",
    "    # loop through all the importances and aggregate for original column\n",
    "    importances = {}\n",
    "    for col in original_columns: \n",
    "        importances[col] = 0\n",
    "        for dummy_col, val in dummy_importances.items():\n",
    "            if str(dummy_col).startswith(col + \"_\") or dummy_col == col:\n",
    "                importances[col] = importances[col] + val\n",
    "    return importances\n",
    "\n",
    "# sorts by first passed in..\n",
    "def plot_importances(importances, names):\n",
    "    assert(len(importances) >= 1)\n",
    "    assert(len(importances) == len(names))\n",
    "    import plotly.graph_objs as go\n",
    "    cols = sorted(importances[0], key = importances[0].get, reverse = True)\n",
    "    fig = go.FigureWidget()\n",
    "    fig.layout.title = \"Importances\"\n",
    "    fig.layout.showlegend = True\n",
    "    for i in range(len(importances)):\n",
    "        fig.add_bar(y=[importances[i][col] for col in cols], x = cols, name= names[i])\n",
    "    return fig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train df size is 21815, test df size is 10746\n"
     ]
    }
   ],
   "source": [
    "target_column = \"yearly-income\"\n",
    "positive_label = \" >50K\"\n",
    "# Make dummy values for train a Gradient Boosted Tree\n",
    "categorical_cols = [col for col in df.columns if col != target_column and df.dtypes[col] == object]\n",
    "categorical_cols.append('education-num')\n",
    "dummy_df = pd.get_dummies(df, columns = categorical_cols)\n",
    "# Split into train and test\n",
    "train_df, test_df = train_test_split(dummy_df, test_size=0.33, random_state=42)\n",
    "print(\"Train df size is {}, test df size is {}\".format(len(train_df), len(test_df)))\n",
    "columns = list(train_df.columns)\n",
    "columns.remove(target_column)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Do a basic grid search over based parameters with early stopping\n",
    "gbm = sklearn.model_selection.GridSearchCV(\n",
    "    ensemble.GradientBoostingClassifier(\n",
    "        n_estimators=500, \n",
    "        n_iter_no_change = 20, \n",
    "        validation_fraction = 0.2), {\n",
    "    \"learning_rate\": np.arange(0.05,0.35,0.05),\n",
    "    \"max_depth\":[1,2,3,4,5]},\n",
    "    cv = 5)\n",
    "gbm.fit(train_df.drop([target_column], axis = 1), \n",
    "        train_df[target_column])\n",
    "tuned_gbm = gbm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters found {'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.05, 'loss': 'deviance', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_iter_no_change': 20, 'presort': 'auto', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.2, 'verbose': 0, 'warm_start': False}\n",
      "\n",
      "Stopped improving after: 500\n",
      "\n",
      "AUC on test is:  92.55 %\n"
     ]
    }
   ],
   "source": [
    "print(\"Best parameters found\", gbm.best_estimator_.get_params())\n",
    "print(\"\\nStopped improving after:\", gbm.best_estimator_.n_estimators_)\n",
    "model = tuned_gbm.best_estimator_\n",
    "# predict and score on test dataset\n",
    "predictions = model.predict_proba(test_df.drop(target_column, axis = 1))[:,list(model.classes_).index(positive_label)]\n",
    "score = get_auc(test_df[target_column], predictions, positive_label)\n",
    "print(\"\\nAUC on test is: \", round(score * 100, 2), \"%\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>marital-status_ Married-civ-spouse</th>\n",
       "      <td>0.362117</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-gain</th>\n",
       "      <td>0.213057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-loss</th>\n",
       "      <td>0.072038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>0.063056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hours-per-week</th>\n",
       "      <td>0.041892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation_ Prof-specialty</th>\n",
       "      <td>0.034868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation_ Exec-managerial</th>\n",
       "      <td>0.031057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fnlwgt</th>\n",
       "      <td>0.024566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education-num_13</th>\n",
       "      <td>0.017286</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education-num_14</th>\n",
       "      <td>0.012526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education_ Bachelors</th>\n",
       "      <td>0.008913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>workclass_ Self-emp-not-inc</th>\n",
       "      <td>0.008253</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education-num_16</th>\n",
       "      <td>0.006599</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation_ Tech-support</th>\n",
       "      <td>0.006491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education_ Prof-school</th>\n",
       "      <td>0.005582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education-num_9</th>\n",
       "      <td>0.004995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>relationship_ Wife</th>\n",
       "      <td>0.004767</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation_ Other-service</th>\n",
       "      <td>0.004670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation_ Farming-fishing</th>\n",
       "      <td>0.004615</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation_ Sales</th>\n",
       "      <td>0.004305</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    Importance\n",
       "marital-status_ Married-civ-spouse    0.362117\n",
       "capital-gain                          0.213057\n",
       "capital-loss                          0.072038\n",
       "age                                   0.063056\n",
       "hours-per-week                        0.041892\n",
       "occupation_ Prof-specialty            0.034868\n",
       "occupation_ Exec-managerial           0.031057\n",
       "fnlwgt                                0.024566\n",
       "education-num_13                      0.017286\n",
       "education-num_14                      0.012526\n",
       "education_ Bachelors                  0.008913\n",
       "workclass_ Self-emp-not-inc           0.008253\n",
       "education-num_16                      0.006599\n",
       "occupation_ Tech-support              0.006491\n",
       "education_ Prof-school                0.005582\n",
       "education-num_9                       0.004995\n",
       "relationship_ Wife                    0.004767\n",
       "occupation_ Other-service             0.004670\n",
       "occupation_ Farming-fishing           0.004615\n",
       "occupation_ Sales                     0.004305"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print raw values for the importances\n",
    "dummy_importances_1 = {}\n",
    "for i in range(len(columns)):\n",
    "    dummy_importances_1[columns[i]] = tuned_gbm.best_estimator_.feature_importances_[i]\n",
    "sort_print_importances(dummy_importances_1).head(20)\n",
    "     \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "plot_importances([dummy_importances_1],[\"Scikit Importances (Dummy)\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>marital-status</th>\n",
       "      <td>0.364811</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-gain</th>\n",
       "      <td>0.213057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupation</th>\n",
       "      <td>0.094029</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-loss</th>\n",
       "      <td>0.072038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>0.063056</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education-num</th>\n",
       "      <td>0.051340</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hours-per-week</th>\n",
       "      <td>0.041892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>education</th>\n",
       "      <td>0.032389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fnlwgt</th>\n",
       "      <td>0.024566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>workclass</th>\n",
       "      <td>0.016797</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>native-country</th>\n",
       "      <td>0.011466</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>relationship</th>\n",
       "      <td>0.010971</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>race</th>\n",
       "      <td>0.001832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sex</th>\n",
       "      <td>0.001756</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                Importance\n",
       "marital-status    0.364811\n",
       "capital-gain      0.213057\n",
       "occupation        0.094029\n",
       "capital-loss      0.072038\n",
       "age               0.063056\n",
       "education-num     0.051340\n",
       "hours-per-week    0.041892\n",
       "education         0.032389\n",
       "fnlwgt            0.024566\n",
       "workclass         0.016797\n",
       "native-country    0.011466\n",
       "relationship      0.010971\n",
       "race              0.001832\n",
       "sex               0.001756"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reduce the importances to the original columns by simple adding up\n",
    "assert(len(columns) == len(tuned_gbm.best_estimator_.feature_importances_))\n",
    "original_columns = list(df.columns)\n",
    "original_columns.remove(target_column)\n",
    "reduced_importances_1 = reduce_importances(dummy_importances_1, original_columns)\n",
    "sort_print_importances(reduced_importances_1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "plot_importances([reduced_importances_1], [\"Reduced Importances\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}