SHARE
TWEET

Untitled

a guest Oct 21st, 2019 73 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. {
  2.  "cells": [
  3.   {
  4.    "cell_type": "code",
  5.    "execution_count": 2,
  6.    "metadata": {},
  7.    "outputs": [
  8.     {
  9.      "data": {
  10.       "text/html": [
  11.        "<div>\n",
  12.        "<style scoped>\n",
  13.        "    .dataframe tbody tr th:only-of-type {\n",
  14.        "        vertical-align: middle;\n",
  15.        "    }\n",
  16.        "\n",
  17.        "    .dataframe tbody tr th {\n",
  18.        "        vertical-align: top;\n",
  19.        "    }\n",
  20.        "\n",
  21.        "    .dataframe thead th {\n",
  22.        "        text-align: right;\n",
  23.        "    }\n",
  24.        "</style>\n",
  25.        "<table border=\"1\" class=\"dataframe\">\n",
  26.        "  <thead>\n",
  27.        "    <tr style=\"text-align: right;\">\n",
  28.        "      <th></th>\n",
  29.        "      <th>category</th>\n",
  30.        "      <th>jobtitle</th>\n",
  31.        "    </tr>\n",
  32.        "  </thead>\n",
  33.        "  <tbody>\n",
  34.        "    <tr>\n",
  35.        "      <th>0</th>\n",
  36.        "      <td>education</td>\n",
  37.        "      <td>After School Supervisor</td>\n",
  38.        "    </tr>\n",
  39.        "    <tr>\n",
  40.        "      <th>1</th>\n",
  41.        "      <td>education</td>\n",
  42.        "      <td>*****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE...</td>\n",
  43.        "    </tr>\n",
  44.        "    <tr>\n",
  45.        "      <th>2</th>\n",
  46.        "      <td>education</td>\n",
  47.        "      <td>Bay Area Family Recruiter</td>\n",
  48.        "    </tr>\n",
  49.        "    <tr>\n",
  50.        "      <th>3</th>\n",
  51.        "      <td>education</td>\n",
  52.        "      <td>Adult Day Programs/Community Access/Job Coaches</td>\n",
  53.        "    </tr>\n",
  54.        "    <tr>\n",
  55.        "      <th>4</th>\n",
  56.        "      <td>education</td>\n",
  57.        "      <td>General Counselor - Non Tenure track</td>\n",
  58.        "    </tr>\n",
  59.        "  </tbody>\n",
  60.        "</table>\n",
  61.        "</div>"
  62.       ],
  63.       "text/plain": [
  64.        "    category                                           jobtitle\n",
  65.        "0  education                            After School Supervisor\n",
  66.        "1  education  *****TUTORS NEEDED - FOR ALL SUBJECTS, ALL AGE...\n",
  67.        "2  education                          Bay Area Family Recruiter\n",
  68.        "3  education    Adult Day Programs/Community Access/Job Coaches\n",
  69.        "4  education               General Counselor - Non Tenure track"
  70.       ]
  71.      },
  72.      "execution_count": 2,
  73.      "metadata": {},
  74.      "output_type": "execute_result"
  75.     }
  76.    ],
  77.    "source": [
  78.     "import pandas as pd\n",
  79.     "df = pd.read_csv(r'craigslistJobTitles.csv',encoding = \"ISO-8859-1\")\n",
  80.     "df.head()"
  81.    ]
  82.   },
  83.   {
  84.    "cell_type": "code",
  85.    "execution_count": 3,
  86.    "metadata": {},
  87.    "outputs": [],
  88.    "source": [
  89.     "from io import StringIO\n",
  90.     "\n",
  91.     "df['category_id'] = df['category'].factorize()[0]\n",
  92.     "category_id_df = df[['category', 'category_id']].drop_duplicates().sort_values('category_id')\n",
  93.     "category_to_id = dict(category_id_df.values)\n",
  94.     "id_to_category = dict(category_id_df[['category_id', 'category']].values)"
  95.    ]
  96.   },
  97.   {
  98.    "cell_type": "code",
  99.    "execution_count": 5,
  100.    "metadata": {},
  101.    "outputs": [
  102.     {
  103.      "data": {
  104.       "image/png": "\n",
  105.       "text/plain": [
  106.        "<Figure size 576x432 with 1 Axes>"
  107.       ]
  108.      },
  109.      "metadata": {
  110.       "needs_background": "light"
  111.      },
  112.      "output_type": "display_data"
  113.     }
  114.    ],
  115.    "source": [
  116.     "import matplotlib.pyplot as plt\n",
  117.     "fig = plt.figure(figsize=(8,6))\n",
  118.     "df.groupby('category').jobtitle.count().plot.bar(ylim=0,color='g')\n",
  119.     "plt.ylabel('count')\n",
  120.     "plt.show()"
  121.    ]
  122.   },
  123.   {
  124.    "cell_type": "code",
  125.    "execution_count": 7,
  126.    "metadata": {},
  127.    "outputs": [
  128.     {
  129.      "data": {
  130.       "text/plain": [
  131.        "13845"
  132.       ]
  133.      },
  134.      "execution_count": 7,
  135.      "metadata": {},
  136.      "output_type": "execute_result"
  137.     }
  138.    ],
  139.    "source": [
  140.     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  141.     "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')\n",
  142.     "len(df)"
  143.    ]
  144.   },
  145.   {
  146.    "cell_type": "code",
  147.    "execution_count": 8,
  148.    "metadata": {},
  149.    "outputs": [
  150.     {
  151.      "data": {
  152.       "text/plain": [
  153.        "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
  154.        "        dtype=<class 'numpy.float64'>, encoding='latin-1', input='content',\n",
  155.        "        lowercase=True, max_df=1.0, max_features=None, min_df=5,\n",
  156.        "        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,\n",
  157.        "        stop_words='english', strip_accents=None, sublinear_tf=True,\n",
  158.        "        token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
  159.        "        vocabulary=None)"
  160.       ]
  161.      },
  162.      "execution_count": 8,
  163.      "metadata": {},
  164.      "output_type": "execute_result"
  165.     }
  166.    ],
  167.    "source": [
  168.     "tfidf"
  169.    ]
  170.   },
  171.   {
  172.    "cell_type": "code",
  173.    "execution_count": 10,
  174.    "metadata": {},
  175.    "outputs": [
  176.     {
  177.      "data": {
  178.       "text/plain": [
  179.        "(13845, 3212)"
  180.       ]
  181.      },
  182.      "execution_count": 10,
  183.      "metadata": {},
  184.      "output_type": "execute_result"
  185.     }
  186.    ],
  187.    "source": [
  188.     "features = tfidf.fit_transform(df.jobtitle).toarray()\n",
  189.     "labels = df.category_id\n",
  190.     "features.shape"
  191.    ]
  192.   },
  193.   {
  194.    "cell_type": "code",
  195.    "execution_count": 12,
  196.    "metadata": {},
  197.    "outputs": [
  198.     {
  199.      "data": {
  200.       "text/plain": [
  201.        "array([[0., 0., 0., ..., 0., 0., 0.],\n",
  202.        "       [0., 0., 0., ..., 0., 0., 0.],\n",
  203.        "       [0., 0., 0., ..., 0., 0., 0.],\n",
  204.        "       ...,\n",
  205.        "       [0., 0., 0., ..., 0., 0., 0.],\n",
  206.        "       [0., 0., 0., ..., 0., 0., 0.],\n",
  207.        "       [0., 0., 0., ..., 0., 0., 0.]])"
  208.       ]
  209.      },
  210.      "execution_count": 12,
  211.      "metadata": {},
  212.      "output_type": "execute_result"
  213.     }
  214.    ],
  215.    "source": [
  216.     "features"
  217.    ]
  218.   },
  219.   {
  220.    "cell_type": "code",
  221.    "execution_count": 15,
  222.    "metadata": {},
  223.    "outputs": [],
  224.    "source": [
  225.     "from sklearn.model_selection import train_test_split\n",
  226.     "from sklearn.feature_extraction.text import CountVectorizer\n",
  227.     "from sklearn.feature_extraction.text import TfidfTransformer\n",
  228.     "from sklearn.naive_bayes import MultinomialNB\n",
  229.     "from sklearn.linear_model import LogisticRegression\n",
  230.     "from sklearn.svm import LinearSVC\n",
  231.     "from sklearn.model_selection import cross_val_score\n",
  232.     "from sklearn.tree import DecisionTreeClassifier\n",
  233.     "from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier"
  234.    ]
  235.   },
  236.   {
  237.    "cell_type": "code",
  238.    "execution_count": 49,
  239.    "metadata": {},
  240.    "outputs": [
  241.     {
  242.      "name": "stderr",
  243.      "output_type": "stream",
  244.      "text": [
  245.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  246.       "  FutureWarning)\n",
  247.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  248.       "  \"this warning.\", FutureWarning)\n",
  249.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  250.       "  FutureWarning)\n",
  251.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  252.       "  \"this warning.\", FutureWarning)\n",
  253.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  254.       "  FutureWarning)\n",
  255.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  256.       "  \"this warning.\", FutureWarning)\n",
  257.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  258.       "  FutureWarning)\n",
  259.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  260.       "  \"this warning.\", FutureWarning)\n",
  261.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  262.       "  FutureWarning)\n",
  263.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  264.       "  \"this warning.\", FutureWarning)\n"
  265.      ]
  266.     },
  267.     {
  268.      "data": {
  269.       "image/png": "\n",
  270.       "text/plain": [
  271.        "<Figure size 432x288 with 1 Axes>"
  272.       ]
  273.      },
  274.      "metadata": {
  275.       "needs_background": "light"
  276.      },
  277.      "output_type": "display_data"
  278.     }
  279.    ],
  280.    "source": [
  281.     "models = [\n",
  282.     "    LogisticRegression(random_state=0),\n",
  283.     "    RandomForestClassifier(n_estimators=100, max_depth=2, random_state=11),\n",
  284.     "    LinearSVC(),\n",
  285.     "    MultinomialNB(),\n",
  286.     "    DecisionTreeClassifier(),\n",
  287.     "    AdaBoostClassifier(n_estimators=20),\n",
  288.     "    GradientBoostingClassifier(n_estimators=20)\n",
  289.     "]\n",
  290.     "CV = 5\n",
  291.     "cv_df = pd.DataFrame(index=range(CV * len(models)))\n",
  292.     "entries = []\n",
  293.     "for model in models:\n",
  294.     "  model_name = model.__class__.__name__\n",
  295.     "  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)\n",
  296.     "  for fold_idx, accuracy in enumerate(accuracies):\n",
  297.     "    entries.append((model_name, fold_idx, accuracy))\n",
  298.     "    \n",
  299.     "cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])\n",
  300.     "\n",
  301.     "import seaborn as sns\n",
  302.     "g = sns.boxplot(x='model_name', y='accuracy', data=cv_df)\n",
  303.     "\n",
  304.     "g.set_xticklabels(g.get_xticklabels(),rotation=60)\n",
  305.     "#sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=5, linewidth=1)\n",
  306.     "plt.xlabel('Model')\n",
  307.     "plt.ylabel('Accuracy')\n",
  308.     "plt.ylim(0,1)\n",
  309.     "plt.show()"
  310.    ]
  311.   },
  312.   {
  313.    "cell_type": "code",
  314.    "execution_count": 50,
  315.    "metadata": {},
  316.    "outputs": [
  317.     {
  318.      "data": {
  319.       "text/plain": [
  320.        "model_name\n",
  321.        "AdaBoostClassifier            0.522643\n",
  322.        "DecisionTreeClassifier        0.782883\n",
  323.        "GradientBoostingClassifier    0.677645\n",
  324.        "LinearSVC                     0.822031\n",
  325.        "LogisticRegression            0.824415\n",
  326.        "MultinomialNB                 0.811991\n",
  327.        "RandomForestClassifier        0.551028\n",
  328.        "Name: accuracy, dtype: float64"
  329.       ]
  330.      },
  331.      "execution_count": 50,
  332.      "metadata": {},
  333.      "output_type": "execute_result"
  334.     }
  335.    ],
  336.    "source": [
  337.     "cv_df.groupby('model_name').accuracy.mean()"
  338.    ]
  339.   },
  340.   {
  341.    "cell_type": "code",
  342.    "execution_count": 23,
  343.    "metadata": {},
  344.    "outputs": [
  345.     {
  346.      "data": {
  347.       "text/plain": [
  348.        "(0, 1)"
  349.       ]
  350.      },
  351.      "execution_count": 23,
  352.      "metadata": {},
  353.      "output_type": "execute_result"
  354.     },
  355.     {
  356.      "data": {
  357.       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD8CAYAAAB0IB+mAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHklJREFUeJzt3Xt0FeW5x/HvQxCQuwiIcilQoYqXozYFq22pCkqxglVQFBHwQm3FeqkKiFBF67VUVLyhBzm6RC1WbTwqqEUKVuFAvVZERVSIoIAioBRCkvf88QQTMSSbsHdm75nfZ62std/Zw87DLPLLyzszz1gIARERib86URcgIiK1Q4EvIpIQCnwRkYRQ4IuIJIQCX0QkIRT4IiIJUW3gm9lUM1ttZv/ewftmZreZ2VIze9PMDkt/mSIisqtSmeFPA/pU8f4vgC5lXyOAu3a9LBERSbdqAz+EMBf4oopd+gMPBDcfaG5me6erQBERSY+6afiMtsCKCuPCsm2rtt/RzEbg/wugUaNGP9xvv/3S8O1FRGLuq69glUfqvzZsWBtCaFWTj0lH4Fsl2yrt1xBCmAJMAcjPzw+LFi1Kw7cXEYmpdetg4kSYORO6dIFu3bAHH/y4ph+Xjqt0CoH2FcbtgJVp+FwRkWQKAZ57DgYO9LCvXx8uvhimTdulj01H4BcAZ5ZdrXM4sD6E8J3lHBERSdGLL8IVV8CXX0J+Pjz6KAweDHV2LbKrXdIxs4eBnwMtzawQ+AOwG0AI4W7gGaAvsBTYBAzfpYpERJKuZ0/o0QN694b+/cEqWznfedUGfgjhtGreD8D5aalGRCSJCgth0iQYNQpatYK8PJg8OW1Bv006TtqKiEhNlJbC9Olw112wZQs0bgxXXeXvpTnsQYEvIhKNDz6Aq6+GxYt9/ItfwEUXZfRbKvBFRGpTUZFfbTN1KhQXQ+vWfoL2Jz/J+LdW4IuI1KYPP4T77vPlnAED4IILoFGjWvnWCnwRkUzbuhV2281f/+AHcOGFsP/+cFjt9ppUe2QRkUxauNBn8i+9VL5t8OBaD3vQDF9EJDM2boRbb4Unn/TxjBm1sk5fFQW+iEi6zZ0L118Pa9b4Us4558DQoVFXpcAXEUmbDRvghhu8Dw7AQQfBuHHQuXO0dZVR4IuIpEteHrz+OjRoAOefD6eeusv9b9JJgS8isis++wyaNfOQb9TIZ/gtW8I++0Rd2Xdkz68eEZFcUloKf/2rtzC+++7y7QcfnJVhD5rhi4jsvOXL4dpr4dVXfbxqlf8CyKLlm8oo8EVEUlVSAg895DP6oiJo0cI7XB59dEaanaWbAl9EJBVffw3nnQfvvOPj44+HSy7x9fscocAXEUlFo0bQpo0/Z/aKK+CII6KuaKcp8EVEduStt6BhQ/j+9308dizUq+fbcpACX0Rke//5D9x5JzzyiDc5u/9+v8a+efOoK9slCnwRkYr+7//8CpyVK/2qmx49/GRtXl7Ule0yBb6ICHhbhEmToKDAx127wvjxsN9+0daVRgp8EZGSEhg2zK+vr1cPzj0XhgyBuvGKyHj9bUREaiIvDwYNgpkzfVbfsWPUFWWEAl9EkicEePZZn9mfcIJvGzDAv7L8btldocAXkWRZtcp71b/8sl9e+eMfe7OzGAf9Ngp8EUmG0lJ47DGYPBk2bYKmTf1O2T33jLqyWqPAF5H4+/hjuOYa71UPcMwxcPnliQp7UOCLSBJMmABvvOHNzkaP9mZnCaTAF5F4CqG8g+WoUX7X7EUX+VJOQsX/LIWIJEtRkbdFGDu2fNu2m6gSHPagGb6IxMkbb/jyzccf++x+2DAPewEU+CISB5s2+dU3M2b4Uk7Hjj6jV9h/iwJfRHLbK6/AH/8In37qd8wOHQrnnOMtEuRbFPgiktteecXDfr/9NKuvhgJfRHLPunWwxx7++je/gXbt4OSTY9HCOJN0l
Y6I5I61a/2GqSFDfN0eYPfd4ZRTFPYpSCnwzayPmb1rZkvNbHQl73cwsxfN7DUze9PM+qa/VBFJrBDgqadg4ECYPdt71y9ZEnVVOafaJR0zywPuAHoDhcBCMysIISyusNuVwF9CCHeZWTfgGaBjBuoVkaRZudJPyi5Y4OMjjvCHiLdpE21dOSiVNfzuwNIQwjIAM3sE6A9UDPwAbLujoRmwMp1FikhCPf003HCDP2O2aVO49FL4xS/K76CVnZJK4LcFVlQYFwI9ttvnKuA5M7sAaAT0quyDzGwEMAKgQ4cOO1uriCRNkyYe9sce62HfokXUFeW0VNbwK/tVGrYbnwZMCyG0A/oCD5rZdz47hDAlhJAfQshv1arVzlcrIvFWXAwLF5aPf/YzeOABuO46hX0apBL4hUD7CuN2fHfJ5mzgLwAhhFeABkDLdBQoIgmxZAmceSacfz4srrBi3K1bdDXFTCqBvxDoYmadzKweMAgo2G6f5cAxAGa2Px74a9JZqIjE1JYtcPvtHvbvvecnY4uLo64qlqpdww8hFJvZSGAWkAdMDSG8bWYTgEUhhALg98C9ZnYxvtwzLISw/bKPiMi3vfaaP5hk+XI/EXv66X4j1e67R11ZLKV0p20I4Rn8UsuK28ZXeL0YODK9pYlIrD35JFx7rb/u3BnGjYODDoq2pphTawURicaRR0Lz5n4z1fDhanZWCxT4IlI71q+HRx/1TpZ16kCrVlBQAA0bRl1ZYijwRSSzQoAXXoCbbvKmZ40aweDB/p7CvlYp8EUkc9asgRtvhDlzfHzYYX5tvURCgS8i6ReCL9fccgt89ZXP5C+6CE480ZdzJBIKfBFJv7//3S+3BPjJT7zZWevW0dYkCnwRyYCjj/alm+OO8z44anaWFfR/KxHZdcuWeUuE1at9XKcO/PnPHvgK+6yhwBeRmtu6Fe67z++QXbAA7ror6oqkClrSEZGaWbwYJkyApUt9fNJJ8LvfRVuTVEmBLyI7Z/NmuOceeOghKC31B4hfeSXk50ddmVRDgS8iO2f5cg978IeJ//rX0KBBtDVJShT4IlK9LVugfn1/3bWrP33qgAP8S3KGTtqKSNVeegl+9Sv4xz/Kt51yisI+B2mGLyKVW7cOJk6EmTN9/NRT0LNntDXJLlHgi8i3hQDPP+/Nzr780pdyzj8fBg2KujLZRQp8ESm3bp23RJg718c/+pFfgdO2bbR1SVoo8EWkXP368P770LgxXHwx9OunO2VjRIEvknQrVsCee3pHy4YNvZ1xq1b+JbGiq3REkqq0FB58EE49Fe68s3x7t24K+5jSDF8kiT74AK6+2tsjAGzc6L8A1Ks+1hT4IkmydSvcfz9MnQrFxd6jfuxYf6C4xJ4CXyQpvvoKzjrLWxkDDBgAF1zgz5iVRFDgiyRF48bw/e/7zP7KK/35spIoCnyROFu4EJo18/43AGPG+KWX2/riSKIo8EXiaONGuPVWePJJD/sHHoC6daFp06grkwgp8EXiZu5cuP56WLMGdtsNevWKuiLJEgp8kbj44gv405/gued8fPDBMG4cdOoUbV2SNRT4InFQUgLDh8Mnn/jDSEaO9BbGuq5eKlDgi8RBXh4MHQovvOBX4OyzT9QVSRZS4IvkotJSeOIJb2x20km+7Ve/8i81O5MdUOCL5Jrly+Haa+HVV335pmdPb36moJdqKPBFckVJiT88/O67oagIWrSA0aM97EVSoMAXyQXvvQcTJsCSJT7+5S/hkkt0Xb3slJRO4ZtZHzN718yWmtnoHexzipktNrO3zWx6essUSbAQvEf9kiXQpg3cfjtcdZXCXnZatTN8M8sD7gB6A4XAQjMrCCEsrrBPF2AMcGQIYZ2Ztc5UwSKJsa1dsRlccQU8/rg/W7Zhw6grkxyVygy/O7A0hLAshFAEPAL0326fc4E7QgjrAEIIq9NbpkiCbNoEEyfCqFE+uwdvenbZZQp72SWprOG3BVZUGBcCPbbbpyuAm
f0TyAOuCiHM3P6DzGwEMAKgQ4cONalXJN4WLIA//hFWrvTZ/QcfwL77Rl2VxEQqgV/ZtV6hks/pAvwcaAfMM7MDQwhffusPhTAFmAKQn5+//WeIJNeGDTBpEhQU+LhrV/jDHxT2klapBH4h0L7CuB2wspJ95ocQtgIfmtm7+C+AhWmpUiTO5szxZmeffw716sGIEXDGGd7dUiSNUlnDXwh0MbNOZlYPGAQUbLfPk8BRAGbWEl/iWZbOQkVi6803PewPOQQefhiGDVPYS0ZU+68qhFBsZiOBWfj6/NQQwttmNgFYFEIoKHvvWDNbDJQAl4UQPs9k4SI5KwRvXdy67GK2ESOgY0e/tl7NziSDLIRoltLz8/PDokWLIvneIpFZtQquu85Pxv7lL/7YQZGdYGb/CiHk1+TPajohUhtKSz3gTzkFXnkFNm8uf5i4SC3RQqFIpn38MVxzDbz+uo+POcavsW/RItq6JHEU+CKZ9Pjj/hSqoiJvcjZqFBx9dNRVSUIp8EUyqU0bD/t+/eCii9T/RiKlwBdJp6IiWLgQjjzSx0ccAY8+6q0RRCKmk7Yi6fLGG3DaaT6Tf+ut8u0Ke8kSmuGL7KpNm2DyZJgxw6+x79jRnzErkmUU+CK74pVXvNnZp596yA8bBmef7S0SRLKMAl+kph57DG64wV/vvz+MG+dNz0SylNbwRWrqqKOgVSv43e9g2jSFvWQ9zfBFUrV2LUyf7k+dysvz6+r/9jct30jOUOCLVCcEeOopuOUW2LgRmjeHM8/09xT2kkMU+CJVWbnST8ouWODjI46A446LtiaRGlLgi1RmW7OzyZO90VmzZnDppdCnjz9UXCQHKfBFKvPCC94DB+DYYz3s1exMcpwCX6QyvXrBiy/6jL5nz6irEUkLXZYpAvDOO37D1KpVPq5Tx58zq7CXGFHgS7Jt2QK33QZDh3ovnHvvjboikYzRko4k16uvwrXXwvLlPqMfPBjOOy/qqkQyRoEvyfP113D77d4aAaBzZxg/Hg48MNq6RDJMgS/Js3IlPPEE1K0LZ50Fw4fDbrtFXZVIxinwJRm+/hoaNfLXXbrAFVfAAQfAvvtGW5dILdJJW4m3EOC55+DEE2H27PLt/fsr7CVxNMOX+Fqzxi+tnDvXxy+8oAeIS6Ip8CV+QvAulpMmwVdf+VLOhRf6LF8kwRT4Ei+ffw5XXukPEgf46U9hzBho3TraukSygAJf4qVRI78Kp3lzuOwy74OjZmcigAJf4mDZMthrLw/7Bg3g5pv9SVR77BF1ZSJZRVfpSO7autVbIZx+ut9ItU3Xrgp7kUpohi+5afFimDABli71cQjew76O5jAiO6LAl9yyeTPccw889JAHfLt2MG4c/PCHUVcmkvUU+JI7Nm6EIUOgsNBn8kOGwK9/7ev2IlItBb7kjiZNvMFZgwbe7Kxbt6grEskpCnzJbvPmQcuWsP/+Ph4zBurVU7MzkRpQ4Et2WrfOnyk7a5b3vHnwQQ/5bQ3QRGSnpXRJg5n1MbN3zWypmY2uYr8BZhbMLD99JUqihAAzZ8KAAR72DRpAv36Qlxd1ZSI5r9oZvpnlAXcAvYFCYKGZFYQQFm+3XxPgd8CCTBQqCbB6tTc7mzfPx927w9ix0LZttHWJxEQqSzrdgaUhhGUAZvYI0B9YvN1+1wA3AZemtUJJhuJifxjJp59C48Zw8cU+s1dbBJG0SWVJpy2wosK4sGzbN8zsUKB9COF/q/ogMxthZovMbNGaNWt2uliJsbp14dxzoWdPmDHD+9Ur7EXSKpUZfmU/deGbN83qALcAw6r7oBDCFGAKQH5+fqhmd4mzkhKYPh3q14dTTvFt/fppVi+SQakEfiHQvsK4HbCywrgJcCAwx/wHtQ1QYGb9QgiL0lWoxMj778M113h7hPr1oVcvaNFCQS+SYakE/kKgi5l1Aj4BBgGnb3szhLAeaLltbGZzgEsV9vIdRUVw//0wdarP8Pfay0/KtmgRdWUii
VBt4IcQis1sJDALyAOmhhDeNrMJwKIQQkGmi5QYeOstn9UvW+bjgQNh5EhdVy9Si1K68SqE8AzwzHbbxu9g35/velkSKyHArbd62Hfo4M3ODj006qpEEkd32krmFBf71TdmvnTz9NN+JU79+lFXJpJICnxJv40b/QHin38Ot9zigd+pky/hiEhkFPiSXv/4h98tu3at97758EPo3DnqqkQEBb6kyxdf+LNkn3/exwcf7Gv1nTpFW5eIfEOBL7tu5ky46SbYsAF2392XbgYO1OMGRbKMAl923bJlHvY9evjJ2X32iboiEamEAl92XmkprFpV3sXynHO8Z33v3rpbViSL6f/csnOWL/fnyJ51ls/qwZ9AdeyxCnuRLKfAl9SUlMADD8CgQfDaa34z1YoV1f85EckaWtKR6r33HkyYAEuW+PiEE7xffdOm0dYlIjtFgS9Ve+QRv3mqpAT23ttPyh5+eNRViUgNKPClap07+0naU0+F88+Hhg2jrkhEakiBL9+2aRPMnw9HH+3j7t3h8cehffuq/5yIZD2dtJVy8+f7TH7UKHj99fLtCnuRWNAMX/zyyltugaee8vEPfqA+9SIxpMBPutmz4cYbvbNlvXowYgSccYa3NRaRWNFPdZI9/DBMnOivDznEm51973vR1iQiGaPAT7LjjvPQHzIETj5Zzc5EYk4/4UmycqV3tSwu9nGLFn4FjjpbiiSCZvhJUFoKM2bA5Mnwn//AXnvB0KH+ntbqRRJDP+1x99FHcM018MYbPu7Vy1sjiEjiKPDjqrjYm53dey9s3Qp77gmjR8NRR0VdmYhERIEfV7Nnw513+uv+/eHCC9XsTCThFPhxEkJ5T/peveDll6FvX2+PICKJp0sz4uL11/2GqU8+8XGdOnDVVQp7EfmGAj/Xbdrkl1qecw68+y5MmxZ1RSKSpbSkk8tefhmuuw4+/RTy8mD4cH/0oIhIJRT4uWjDBm+J8PTTPt5/fxg/Hrp0ibYuEclqCvxctHYtzJrlzc7OOw8GD/YZvohIFRT4uWL9er+s0syfQjV+PBx4IHToEHVlIpIjdNI224UABQVw4onw/PPl2/v2VdiLyE5R4GezlSv9ObITJsDGjfDPf0ZdkYjkMC3pZKPSUnj0UbjjDti8GZo1g0svhT59oq5MRHKYAj/brF7tPW/efNPHxx7rYd+iRbR1iUjOS2lJx8z6mNm7ZrbUzEZX8v4lZrbYzN40s7+bmR6bVFPNmsGXX0KrVvDnP/t19gp7EUmDamf4ZpYH3AH0BgqBhWZWEEJYXGG314D8EMImM/sNcBNwaiYKjqV33oF27aBJE6hfH/70Jw/8Jk2irkxEYiSVGX53YGkIYVkIoQh4BOhfcYcQwoshhE1lw/lAu/SWGVNbtsBtt/nDSG67rXx7584KexFJu1TW8NsCKyqMC4EeVex/NvBsZW+Y2QhgBECHpF9S+Oqr/mCSFSu80VnDht/udikikmapBH5lCRQq3dHsDCAf6FnZ+yGEKcAUgPz8/Eo/I/a+/tpn83/9q48r3kQlIpJBqQR+IdC+wrgdsHL7ncysFzAW6BlC2JKe8mJmwwY47TT47DN/luxZZ3nDs912i7oyEUmAVAJ/IdDFzDoBnwCDgNMr7mBmhwL3AH1CCKvTXmVcNG0KP/oRLFvms/p99426IhFJkGoDP4RQbGYjgVlAHjA1hPC2mU0AFoUQCoCbgcbADPM16OUhhH4ZrDs3hODtEPbeGw46yLeNGuVX4tTRTc4iUrtSuvEqhPAM8Mx228ZXeN0rzXXlvtWr4YYbYO5c6NQJHnrIu1vuvnvUlYlIQulO23QLAZ58EiZN8hO0jRr5un1dHWoRiZZSKJ0KC+Haa2HRIh//9KcwZgy0bh1tXSIiKPDTp7gYRozwpZzmzeHyy6F3b11XLyJZQ4GfLnXreivj+fPh97/30BcRySIK/JrauhXuv9/X6AcP9m3HH+9fIiJZSIFfE2+/7Q8l+eADv/Kmb
1/YY4+oqxIRqZICf2ds3gx33w3Tp/tDStq3hyuvVNiLSE5Q4Kdq0SJvdvbJJ37T1Jln+knaBg2irkxEJCUK/FSEAFOmeNjvu6+3RejWLeqqRER2igK/KkVFvkZv5ks3zz/vM3s1OxORHKTAr8y6df7UqS+/hMmTPfA7dICzz466MhGRGlPgVxQCzJoFN98M69f7+vxHH3kvHBGRHKfA3+azz+D66+Gll3zcvTuMHQtt20Zbl4hImijwAf72N5g4ETZtgsaN4ZJL4IQT1BZBRGJFgQ8+u9+0CXr2hNGjoVWrqCsSEUm7ZAZ+SYk/PLxjRx8PHw777efdLTWrF5GYSt5jl95/3wN+xAg/MQt+meXPfqawF5FYS84Mv6gIpk71hmclJbDXXrBqFTRrFnVlIiK1IhmB/9Zb3hZh2TIfDxwII0d6p0sRkYSIf+A/8ADcfrtfY9+hA4wbB4ceGnVVIiK1Lv6Bf8AB3uxsyBA491yoXz/qikREIhG/wN+4EebN8x71AD/8IRQU+Jq9iEiCxSvw58yBG26AtWuhTRs47DDfrrAXEYlJ4H/xBdx0E7zwgo8PPhhatIi2JhGRLJPbgR8CPPusd7bcsAF2392vvhk40NftRUTkG7kd+A89BJMm+esePbzZ2T77RFuTiEiWyu3A/+Uv4Ykn/M7Z44/XnbIiIlXIrXWPjz+GCRNg61YfN28OM2Z48CvsRUSqlBsz/JISePBBf65sUZHfQDVsmL+ntXoRkZRkf+C/957P6pcs8fEJJ8BJJ0Vbk4hIDsrewC8qgvvug2nToLQU9t7bT8oefnjUlYmI5KTsDfw5c7y7pRkMGgS//S00bBh1VSIiOSu7Ar+0tHxNvndv+Ne/vEXCf/1XtHWJiMRA9pzxnD8fTj0Vli/3sRmMGaOwFxFJk+gDf8MGuPpqv0P2ww9h+vSoKxIRiaWUAt/M+pjZu2a21MxGV/J+fTN7tOz9BWbWMaXvPns2DBgATz0F9erBBRfAZZft1F9ARERSU+0avpnlAXcAvYFCYKGZFYQQFlfY7WxgXQhhXzMbBNwInFrlBxcWwuWX++tDDvEHk3zvezX6S4iISPVSOWnbHVgaQlgGYGaPAP2BioHfH7iq7PVjwGQzsxBC2OGnfvWVX3VzwQVw8sm6gUpEJMNSCfy2wIoK40Kgx472CSEUm9l6YE9gbcWdzGwEMKJsuMXmzfs38+bVpO64acl2xyrBdCzK6ViU07Eo94Oa/sFUAr+yJjXbz9xT2YcQwhRgCoCZLQoh5Kfw/WNPx6KcjkU5HYtyOhblzGxRTf9sKusohUD7CuN2wMod7WNmdYFmwBc1LUpERNIvlcBfCHQxs05mVg8YBBRst08BMLTs9QBgdpXr9yIiUuuqXdIpW5MfCcwC8oCpIYS3zWwCsCiEUAD8N/CgmS3FZ/aDUvjeU3ah7rjRsSinY1FOx6KcjkW5Gh8L00RcRCQZdC2kiEhCKPBFRBIi44GfsbYMOSiFY3GJmS02szfN7O9mFttbj6s7FhX2G2Bmwcxie0leKsfCzE4p+7fxtpnFtuFUCj8jHczsRTN7reznpG8UdWaamU01s9Vm9u8dvG9mdlvZcXrTzA5L6YNDCBn7wk/yfgB0BuoBbwDdttvnt8DdZa8HAY9msqaovlI8FkcBDcte/ybJx6JsvybAXGA+kB913RH+u+gCvAbsUTZuHXXdER6LKcBvyl53Az6Kuu4MHYufAYcB/97B+32BZ/F7oA4HFqTyuZme4X/TliGEUARsa8tQUX/gf8pePwYcYxbLJ5JXeyxCCC+GEDaVDefj9zzEUSr/LgCuAW4CNtdmcbUslWNxLnBHCGEdQAhhdS3XWFtSORYBaFr2uhnfvScoFkIIc6n6Xqb+wAPBzQeam9ne1X1upgO/srYMbXe0TwihGNjWliFuUjkWFZ2N/waPo2qPhZkdCrQPIfxvbRYWgVT+XXQFuprZP81svpn1qbXqalcqx+Iq4AwzKwSeA
S6ondKyzs7mCZD5J16lrS1DDKT89zSzM4B8oGdGK4pOlcfCzOoAtwDDaqugCKXy76Iuvqzzc/x/ffPM7MAQwpcZrq22pXIsTgOmhRAmmtmP8ft/DgwhlGa+vKxSo9zM9AxfbRnKpXIsMLNewFigXwhhSy3VVtuqOxZNgAOBOWb2Eb5GWRDTE7ep/oz8LYSwNYTwIfAu/gsgblI5FmcDfwEIIbwCNMAbqyVNSnmyvUwHvtoylKv2WJQtY9yDh31c12mhmmMRQlgfQmgZQugYQuiIn8/oF0KocdOoLJbKz8iT+Al9zKwlvsSzrFarrB2pHIvlwDEAZrY/HvhrarXK7FAAnFl2tc7hwPoQwqrq/lBGl3RC5toy5JwUj8XNQGNgRtl56+UhhH6RFZ0hKR6LREjxWMwCjjWzxUAJcFkI4fPoqs6MFI/F74F7zexifAljWBwniGb2ML6E17LsfMUfgN0AQgh34+cv+gJLgU3A8JQ+N4bHSkREKqE7bUVEEkKBLyKSEAp8EZGEUOCLiCSEAl9EJCEU+CIiCaHAFxFJiP8HQndPnM5lh+YAAAAASUVORK5CYII=\n",
  358.       "text/plain": [
  359.        "<Figure size 432x288 with 1 Axes>"
  360.       ]
  361.      },
  362.      "metadata": {
  363.       "needs_background": "light"
  364.      },
  365.      "output_type": "display_data"
  366.     }
  367.    ],
  368.    "source": [
  369.     "plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',\n",
  370.     "         label='Chance', alpha=.8)\n",
  371.     "plt.xlim(0,1)\n",
  372.     "plt.ylim(0,1)"
  373.    ]
  374.   },
  375.   {
  376.    "cell_type": "code",
  377.    "execution_count": null,
  378.    "metadata": {},
  379.    "outputs": [],
  380.    "source": [
  381.     "plt.plot(cv_df['fold_idx'] , cv_df['accuracy'])"
  382.    ]
  383.   },
  384.   {
  385.    "cell_type": "code",
  386.    "execution_count": 81,
  387.    "metadata": {},
  388.    "outputs": [
  389.     {
  390.      "name": "stderr",
  391.      "output_type": "stream",
  392.      "text": [
  393.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  394.       "  FutureWarning)\n",
  395.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  396.       "  \"this warning.\", FutureWarning)\n"
  397.      ]
  398.     }
  399.    ],
  400.    "source": [
  401.     "model = LogisticRegression()\n",
  402.     "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.33, random_state=0)\n",
  403.     "model.fit(X_train, y_train)\n",
  404.     "y_pred = model.predict(X_test)"
  405.    ]
  406.   },
  407.   {
  408.    "cell_type": "code",
  409.    "execution_count": 82,
  410.    "metadata": {},
  411.    "outputs": [
  412.     {
  413.      "data": {
  414.       "image/png": "\n",
  415.       "text/plain": [
  416.        "<Figure size 360x360 with 2 Axes>"
  417.       ]
  418.      },
  419.      "metadata": {
  420.       "needs_background": "light"
  421.      },
  422.      "output_type": "display_data"
  423.     }
  424.    ],
  425.    "source": [
  426.     "from sklearn.metrics import confusion_matrix\n",
  427.     "\n",
  428.     "conf_mat = confusion_matrix(y_test, y_pred)\n",
  429.     "fig, ax = plt.subplots(figsize=(5,5))\n",
  430.     "sns.heatmap(conf_mat, cmap=\"Greens\", annot=True, fmt='d', xticklabels=category_id_df.category.values, yticklabels=category_id_df.category.values)\n",
  431.     "plt.ylabel('Actual')\n",
  432.     "plt.xlabel('Predicted')\n",
  433.     "#ax.set_ylim()\n",
  434.     "plt.show()"
  435.    ]
  436.   },
  437.   {
  438.    "cell_type": "code",
  439.    "execution_count": 56,
  440.    "metadata": {},
  441.    "outputs": [
  442.     {
  443.      "data": {
  444.       "text/plain": [
  445.        "array([[711,   4,   4,   4,  21,  17],\n",
  446.        "       [  6, 652,  84,   2,  20,  42],\n",
  447.        "       [  8,  35, 685,   3,  12,  77],\n",
  448.        "       [  4,   9,   7, 432,  52,  33],\n",
  449.        "       [ 11,  20,  22,  33, 702,  83],\n",
  450.        "       [ 23,  46,  61,   8,  68, 568]], dtype=int64)"
  451.       ]
  452.      },
  453.      "execution_count": 56,
  454.      "metadata": {},
  455.      "output_type": "execute_result"
  456.     }
  457.    ],
  458.    "source": []
  459.   },
  460.   {
  461.    "cell_type": "code",
  462.    "execution_count": 44,
  463.    "metadata": {},
  464.    "outputs": [
  465.     {
  466.      "ename": "ValueError",
  467.      "evalue": "multiclass format is not supported",
  468.      "output_type": "error",
  469.      "traceback": [
  470.       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
  471.       "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
  472.       "\u001b[1;32m<ipython-input-44-a1b4c2fca2f1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[0mprobs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[0mpreds\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mprobs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mfpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mthreshold\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mroc_curve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpreds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      6\u001b[0m \u001b[0mroc_auc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mauc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfpr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtpr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
  473.       "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\ranking.py\u001b[0m in \u001b[0;36mroc_curve\u001b[1;34m(y_true, y_score, pos_label, sample_weight, drop_intermediate)\u001b[0m\n\u001b[0;32m    616\u001b[0m     \"\"\"\n\u001b[0;32m    617\u001b[0m     fps, tps, thresholds = _binary_clf_curve(\n\u001b[1;32m--> 618\u001b[1;33m         y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)\n\u001b[0m\u001b[0;32m    619\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    620\u001b[0m     \u001b[1;31m# Attempt to drop thresholds corresponding to points in between and\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  474.       "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\ranking.py\u001b[0m in \u001b[0;36m_binary_clf_curve\u001b[1;34m(y_true, y_score, pos_label, sample_weight)\u001b[0m\n\u001b[0;32m    395\u001b[0m     if not (y_type == \"binary\" or\n\u001b[0;32m    396\u001b[0m             (y_type == \"multiclass\" and pos_label is not None)):\n\u001b[1;32m--> 397\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"{0} format is not supported\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_type\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    398\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    399\u001b[0m     \u001b[0mcheck_consistent_length\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_score\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  475.       "\u001b[1;31mValueError\u001b[0m: multiclass format is not supported"
  476.      ]
  477.     }
  478.    ],
  479.    "source": [
  480.     "import sklearn.metrics as metrics\n",
  481.     "# y_test is multiclass, so plot one one-vs-rest ROC curve per class:\n",
  482.     "# roc_curve only accepts multiclass targets when pos_label is given.\n",
  483.     "probs = model.predict_proba(X_test)\n",
  484.     "plt.title('Receiver Operating Characteristic')\n",
  485.     "for col, cls in enumerate(model.classes_):\n",
  486.     "    # column `col` of predict_proba is the score for class `cls`\n",
  487.     "    fpr, tpr, threshold = metrics.roc_curve(y_test, probs[:, col], pos_label=cls)\n",
  488.     "    roc_auc = metrics.auc(fpr, tpr)\n",
  489.     "    plt.plot(fpr, tpr, label = 'class %s (AUC = %0.2f)' % (cls, roc_auc))\n",
  490.     "plt.legend(loc = 'lower right')\n",
  491.     "plt.plot([0, 1], [0, 1],'r--')\n",
  492.     "plt.xlim([0, 1])\n",
  493.     "plt.ylim([0, 1])\n",
  494.     "plt.ylabel('True Positive Rate')\n",
  495.     "plt.xlabel('False Positive Rate')\n",
  496.     "plt.show()"
  497.    ]
  498.   },
  499.   {
  500.    "cell_type": "code",
  501.    "execution_count": 46,
  502.    "metadata": {},
  503.    "outputs": [
  504.     {
  505.      "data": {
  506.       "text/plain": [
  507.        "0        0\n",
  508.        "1        0\n",
  509.        "2        0\n",
  510.        "3        0\n",
  511.        "4        0\n",
  512.        "5        0\n",
  513.        "6        0\n",
  514.        "7        0\n",
  515.        "8        0\n",
  516.        "9        0\n",
  517.        "10       0\n",
  518.        "11       0\n",
  519.        "12       0\n",
  520.        "13       0\n",
  521.        "14       0\n",
  522.        "15       0\n",
  523.        "16       0\n",
  524.        "17       0\n",
  525.        "18       0\n",
  526.        "19       0\n",
  527.        "20       0\n",
  528.        "21       0\n",
  529.        "22       0\n",
  530.        "23       0\n",
  531.        "24       0\n",
  532.        "25       0\n",
  533.        "26       0\n",
  534.        "27       0\n",
  535.        "28       0\n",
  536.        "29       0\n",
  537.        "        ..\n",
  538.        "13815    5\n",
  539.        "13816    5\n",
  540.        "13817    5\n",
  541.        "13818    5\n",
  542.        "13819    5\n",
  543.        "13820    5\n",
  544.        "13821    5\n",
  545.        "13822    5\n",
  546.        "13823    5\n",
  547.        "13824    5\n",
  548.        "13825    5\n",
  549.        "13826    5\n",
  550.        "13827    5\n",
  551.        "13828    5\n",
  552.        "13829    5\n",
  553.        "13830    5\n",
  554.        "13831    5\n",
  555.        "13832    5\n",
  556.        "13833    5\n",
  557.        "13834    5\n",
  558.        "13835    5\n",
  559.        "13836    5\n",
  560.        "13837    5\n",
  561.        "13838    5\n",
  562.        "13839    5\n",
  563.        "13840    5\n",
  564.        "13841    5\n",
  565.        "13842    5\n",
  566.        "13843    5\n",
  567.        "13844    5\n",
  568.        "Name: category_id, Length: 13845, dtype: int64"
  569.       ]
  570.      },
  571.      "execution_count": 46,
  572.      "metadata": {},
  573.      "output_type": "execute_result"
  574.     }
  575.    ],
  576.    "source": [
  577.     "from itertools import cycle\n",
  578.     "from sklearn.metrics import roc_curve, auc\n",
  579.     "from sklearn.preprocessing import label_binarize\n",
  580.     "from sklearn.multiclass import OneVsRestClassifier\n",
  581.     "from numpy import interp  # scipy.interp was only a deprecated alias of numpy.interp\n",
  582.     "\n",
  583.     "labels"
  584.    ]
  585.   },
  586.   {
  587.    "cell_type": "code",
  588.    "execution_count": null,
  589.    "metadata": {},
  590.    "outputs": [],
  591.    "source": [
  592.     "# Binarize the output: one indicator column per category id present in `labels`\n",
  593.     "y = label_binarize(labels, classes=sorted(labels.unique()))\n",
  594.     "n_classes = y.shape[1]"
  595.    ]
  596.   },
  597.   {
  598.    "cell_type": "code",
  599.    "execution_count": 61,
  600.    "metadata": {},
  601.    "outputs": [],
  602.    "source": [
  603.     "import xgboost as xgb"
  604.    ]
  605.   },
  606.   {
  607.    "cell_type": "code",
  608.    "execution_count": 65,
  609.    "metadata": {},
  610.    "outputs": [],
  611.    "source": [
  612.     "(X_train, X_test, Y_train, Y_test, indices_train, indices_test) = train_test_split(features, labels, df.index, random_state=0, test_size=0.33)"
  613.    ]
  614.   },
  615.   {
  616.    "cell_type": "code",
  617.    "execution_count": 67,
  618.    "metadata": {},
  619.    "outputs": [],
  620.    "source": [
  621.     "D_train = xgb.DMatrix(data=X_train, label=Y_train)\n",
  622.     "D_test = xgb.DMatrix(data=X_test, label=Y_test)"
  623.    ]
  624.   },
  625.   {
  626.    "cell_type": "code",
  627.    "execution_count": 76,
  628.    "metadata": {},
  629.    "outputs": [],
  630.    "source": [
  631.     "param = dict(\n",
  632.     "    eta=0.3,\n",
  633.     "    max_depth=3,\n",
  634.     "    objective='multi:softprob',  # per-class probabilities; argmax is taken downstream\n",
  635.     "    num_class=6)  # category ids 0-5\n",
  636.     "\n",
  637.     "steps = 500  # The number of training iterations"
  638.    ]
  639.   },
  640.   {
  641.    "cell_type": "code",
  642.    "execution_count": 77,
  643.    "metadata": {},
  644.    "outputs": [],
  645.    "source": [
  646.     "model = xgb.train(params=param, dtrain=D_train, num_boost_round=steps)"
  647.    ]
  648.   },
  649.   {
  650.    "cell_type": "code",
  651.    "execution_count": 78,
  652.    "metadata": {},
  653.    "outputs": [
  654.     {
  655.      "name": "stdout",
  656.      "output_type": "stream",
  657.      "text": [
  658.       "Precision = 0.8178374981626977\n",
  659.       "Recall = 0.8109642765523942\n",
  660.       "Accuracy = 0.8093674764718757\n"
  661.      ]
  662.     }
  663.    ],
  664.    "source": [
  665.     "import numpy as np\n",
  666.     "from sklearn.metrics import precision_score, recall_score, accuracy_score\n",
  667.     "\n",
  668.     "preds = model.predict(D_test)  # multi:softprob -> one row of class probabilities per sample\n",
  669.     "best_preds = np.argmax(preds, axis=1)  # vectorized: most probable class id per row\n",
  670.     "\n",
  671.     "print(\"Precision = {}\".format(precision_score(Y_test, best_preds, average='macro')))\n",
  672.     "print(\"Recall = {}\".format(recall_score(Y_test, best_preds, average='macro')))\n",
  673.     "print(\"Accuracy = {}\".format(accuracy_score(Y_test, best_preds)))"
  674.    ]
  675.   },
  676.   {
  677.    "cell_type": "code",
  678.    "execution_count": 83,
  679.    "metadata": {},
  680.    "outputs": [
  681.     {
  682.      "name": "stderr",
  683.      "output_type": "stream",
  684.      "text": [
  685.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
  686.       "  FutureWarning)\n",
  687.       "C:\\Users\\agyumol\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
  688.       "  \"this warning.\", FutureWarning)\n"
  689.      ]
  690.     },
  691.     {
  692.      "name": "stdout",
  693.      "output_type": "stream",
  694.      "text": [
  695.       "# 'accounting':\n",
  696.       "  . Top unigrams:\n",
  697.       "       . accountant\n",
  698.       "       . accounting\n",
  699.       "  . Top bigrams:\n",
  700.       "       . accounts payable\n",
  701.       "       . success manager\n",
  702.       "# 'administrative':\n",
  703.       "  . Top unigrams:\n",
  704.       "       . office\n",
  705.       "       . admin\n",
  706.       "  . Top bigrams:\n",
  707.       "       . executive assistant\n",
  708.       "       . personal assistant\n",
  709.       "# 'customerservice':\n",
  710.       "  . Top unigrams:\n",
  711.       "       . customer\n",
  712.       "       . sales\n",
  713.       "  . Top bigrams:\n",
  714.       "       . service advisor\n",
  715.       "       . teller time\n",
  716.       "# 'education':\n",
  717.       "  . Top unigrams:\n",
  718.       "       . teacher\n",
  719.       "       . teachers\n",
  720.       "  . Top bigrams:\n",
  721.       "       . tutors needed\n",
  722.       "       . hiring time\n",
  723.       "# 'foodbeverage':\n",
  724.       "  . Top unigrams:\n",
  725.       "       . cook\n",
  726.       "       . restaurant\n",
  727.       "  . Top bigrams:\n",
  728.       "       . assistant manager\n",
  729.       "       . team members\n",
  730.       "# 'labor':\n",
  731.       "  . Top unigrams:\n",
  732.       "       . maintenance\n",
  733.       "       . warehouse\n",
  734.       "  . Top bigrams:\n",
  735.       "       . 595 week\n",
  736.       "       . make 595\n"
  737.      ]
  738.     }
  739.    ],
  740.    "source": [
  741.     "from sklearn.linear_model import LogisticRegression\n",
  742.     "N = 2\n",
  743.     "# `model` is rebound to an xgboost Booster above (no fit/coef_), so fit a dedicated linear model; LogisticRegression defaults assumed from the recorded warnings -- TODO confirm\n",
  744.     "linear_model = LogisticRegression(random_state=0).fit(features, labels)\n",
  745.     "feature_names = np.array(tfidf.get_feature_names())  # loop-invariant, build once\n",
  746.     "for category, category_id in sorted(category_to_id.items()):\n",
  747.     "  ranked = feature_names[np.argsort(linear_model.coef_[category_id])[::-1]]  # strongest coefficient first\n",
  748.     "  unigrams = [v for v in ranked if len(v.split(' ')) == 1][:N]\n",
  749.     "  bigrams = [v for v in ranked if len(v.split(' ')) == 2][:N]\n",
  750.     "  print(\"# '{}':\".format(category))\n",
  751.     "  print(\"  . Top unigrams:\\n       . {}\".format('\\n       . '.join(unigrams)))\n",
  752.     "  print(\"  . Top bigrams:\\n       . {}\".format('\\n       . '.join(bigrams)))"
  753.    ]
  754.   },
  755.   {
  756.    "cell_type": "code",
  757.    "execution_count": null,
  758.    "metadata": {},
  759.    "outputs": [],
  760.    "source": []
  761.   }
  762.  ],
  763.  "metadata": {
  764.   "kernelspec": {
  765.    "display_name": "Python 3",
  766.    "language": "python",
  767.    "name": "python3"
  768.   },
  769.   "language_info": {
  770.    "codemirror_mode": {
  771.     "name": "ipython",
  772.     "version": 3
  773.    },
  774.    "file_extension": ".py",
  775.    "mimetype": "text/x-python",
  776.    "name": "python",
  777.    "nbconvert_exporter": "python",
  778.    "pygments_lexer": "ipython3",
  779.    "version": "3.7.3"
  780.   }
  781.  },
  782.  "nbformat": 4,
  783.  "nbformat_minor": 2
  784. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top