Untitled

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Python libraries\n",
    "import pandas as pd \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly\n",
    "import plotly.graph_objs as go\n",
    "import imblearn.under_sampling as imb # to downsample the datasaet \n",
    "from sklearn.model_selection import train_test_split # to create train and test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('2016_cens_locals_plantabaixa.csv', header = 0, index_col = None, sep = ',', low_memory=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering & Data Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 78033 entries, 0 to 78032\n",
      "Data columns (total 34 columns):\n",
      "ID_BCN            78033 non-null int64\n",
      "ID_PRINCIP        78033 non-null int64\n",
      "N_PRINCIP         78033 non-null object\n",
      "ID_SECTOR         78033 non-null int64\n",
      "N_SECTOR          78033 non-null object\n",
      "ID_GRUPACT        78033 non-null int64\n",
      "N_GRUPACT         78033 non-null object\n",
      "ID_ACT            78033 non-null int64\n",
      "N_ACT             78033 non-null object\n",
      "N_LOCAL           78033 non-null object\n",
      "SN_CARRER         78033 non-null int64\n",
      "SN_MERCAT         78033 non-null int64\n",
      "ID_MERCAT         78033 non-null int64\n",
      "N_MERCAT          78033 non-null object\n",
      "SN_GALERIA        78033 non-null int64\n",
      "N_GALERIA         78033 non-null object\n",
      "SN_CCOMERC        78033 non-null int64\n",
      "ID_CCOMERC        78033 non-null object\n",
      "N_CCOMERC         78033 non-null object\n",
      "N_CARRER          78033 non-null object\n",
      "NUM_POLICI        78017 non-null float64\n",
      "REF_CAD           78033 non-null object\n",
      "DATA              78033 non-null object\n",
      "Codi_Barri        78033 non-null int64\n",
      "Nom_Barri         78033 non-null object\n",
      "Codi_Districte    78033 non-null int64\n",
      "N_DISTRI          78033 non-null object\n",
      "N_EIX             26218 non-null object\n",
      "SN_EIX            78033 non-null int64\n",
      "SEC_CENS          78033 non-null int64\n",
      "Y_UTM_ETRS        78033 non-null float64\n",
      "X_UTM_ETRS        78033 non-null float64\n",
      "LATITUD           78033 non-null float64\n",
      "LONGITUD          78033 non-null float64\n",
      "dtypes: float64(5), int64(14), object(15)\n",
      "memory usage: 20.2+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We want to predict 'ID_PRINCIP' (1: is active, 2: no information available, 0: is not active) based on the actual information available. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Actiu               60265\n",
       "Sense informació    13023\n",
       "Sense activitat      4745\n",
       "Name: N_PRINCIP, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['N_PRINCIP'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We removed the observationes labeled as 'Sense informació' (without information)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[data['N_PRINCIP'] != 'Sense informació']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_data = data.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We decided to drop the 'SN_EIX', 'SN_CCOMERC', 'SN_GALERIA', 'SN_CARRER' and 'SN_MERCAT' features, because these column are a binary feature that indicate when N_EIX, ID_CCOMERC, N_GALERIA, N_CARRER and N_MERCAT respectively are NaN or not. \n",
    "\n",
    "We drop 'ID_BCN' because is unique for every observation, it doesn't give us relevant information.\n",
    "\n",
    "Y_UTM_ETRS and X_UTM_ETRS has the same information as LATITUD and LONGITUD, we keep only the last two.\n",
    "\n",
    "For the following features we decide to keep the ID version (numerical map of the categorical feature):\n",
    "- N_PRINCIP\n",
    "- N_SECTOR\n",
    "- N_GRUPACT\n",
    "- N_ACT\n",
    "- N_DISTRI\n",
    "- Nom_Barri\n",
    "- N_MERCAT\n",
    "- ID_CCOMERC\n",
    "\n",
    "'N_GALERIA' is empty, we dropped this feature too."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop(['ID_BCN', 'SN_EIX', 'SN_GALERIA', 'SN_CCOMERC', 'SN_CARRER', 'SN_MERCAT', 'N_PRINCIP', 'N_SECTOR', 'N_GRUPACT', 'N_ACT', 'N_DISTRI', 'Nom_Barri', 'N_MERCAT', \n",
    "                  'ID_CCOMERC', 'Y_UTM_ETRS', 'X_UTM_ETRS', 'N_GALERIA'], axis = 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The initial dataset has 78033 rows and 34 columns (features). "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We transform the 'DATA' feature from string to datetime. Therefore, we keep the year and month values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['DATA'] = pd.to_datetime(data['DATA'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['YEAR'] = data['DATA'].dt.year \n",
    "data = data.drop('DATA', axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2016    64740\n",
       "2017      257\n",
       "1899       12\n",
       "2001        1\n",
       "Name: YEAR, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['YEAR'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The date isn't relevant, because is not the date of when the local starts is just when was checked the activity and it has some erros (Hotel W Barcelona, Desigual and World Trade Center appears checked on 1899, when the 3 of them were inagurated many decades after). We drop the DATA feature from the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.drop('YEAR', axis = 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After we remove the features with repeated or useless information we end up with 16 features, including the response."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID_PRINCIP</th>\n",
       "      <th>ID_SECTOR</th>\n",
       "      <th>ID_GRUPACT</th>\n",
       "      <th>ID_ACT</th>\n",
       "      <th>N_LOCAL</th>\n",
       "      <th>ID_MERCAT</th>\n",
       "      <th>N_CCOMERC</th>\n",
       "      <th>N_CARRER</th>\n",
       "      <th>NUM_POLICI</th>\n",
       "      <th>REF_CAD</th>\n",
       "      <th>Codi_Barri</th>\n",
       "      <th>Codi_Districte</th>\n",
       "      <th>N_EIX</th>\n",
       "      <th>SEC_CENS</th>\n",
       "      <th>LATITUD</th>\n",
       "      <th>LONGITUD</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4003</td>\n",
       "      <td>INSIDE</td>\n",
       "      <td>0</td>\n",
       "      <td>Maremàgnum</td>\n",
       "      <td>MOLL ESPANYA</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1813205DF3811D</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1030</td>\n",
       "      <td>41.375377</td>\n",
       "      <td>2.182944</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>14000</td>\n",
       "      <td>KURTZ & GUT</td>\n",
       "      <td>0</td>\n",
       "      <td>Maremàgnum</td>\n",
       "      <td>MOLL ESPANYA</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1813205DF3811D</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1030</td>\n",
       "      <td>41.375390</td>\n",
       "      <td>2.183092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4003</td>\n",
       "      <td>SPRINGFIELD</td>\n",
       "      <td>0</td>\n",
       "      <td>Maremàgnum</td>\n",
       "      <td>MOLL ESPANYA</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1813205DF3811D</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1030</td>\n",
       "      <td>41.375404</td>\n",
       "      <td>2.183217</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4000</td>\n",
       "      <td>CASAS KIDS</td>\n",
       "      <td>0</td>\n",
       "      <td>Maremàgnum</td>\n",
       "      <td>MOLL ESPANYA</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1813205DF3811D</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1030</td>\n",
       "      <td>41.375420</td>\n",
       "      <td>2.183354</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>14000</td>\n",
       "      <td>BARITIMO LOUGE CLUB</td>\n",
       "      <td>0</td>\n",
       "      <td>Maremàgnum</td>\n",
       "      <td>MOLL ESPANYA</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1813205DF3811D</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1030</td>\n",
       "      <td>41.375425</td>\n",
       "      <td>2.183517</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID_PRINCIP  ID_SECTOR  ID_GRUPACT  ID_ACT              N_LOCAL  ID_MERCAT  \\\n",
       "0           1          1           4    4003               INSIDE          0   \n",
       "1           1          2          14   14000          KURTZ & GUT          0   \n",
       "2           1          1           4    4003          SPRINGFIELD          0   \n",
       "3           1          1           4    4000           CASAS KIDS          0   \n",
       "4           1          2          14   14000  BARITIMO LOUGE CLUB          0   \n",
       "\n",
       "    N_CCOMERC      N_CARRER  NUM_POLICI         REF_CAD  Codi_Barri  \\\n",
       "0  Maremàgnum  MOLL ESPANYA         5.0  1813205DF3811D           2   \n",
       "1  Maremàgnum  MOLL ESPANYA         5.0  1813205DF3811D           2   \n",
       "2  Maremàgnum  MOLL ESPANYA         5.0  1813205DF3811D           2   \n",
       "3  Maremàgnum  MOLL ESPANYA         5.0  1813205DF3811D           2   \n",
       "4  Maremàgnum  MOLL ESPANYA         5.0  1813205DF3811D           2   \n",
       "\n",
       "   Codi_Districte N_EIX  SEC_CENS    LATITUD  LONGITUD  \n",
       "0               1   NaN      1030  41.375377  2.182944  \n",
       "1               1   NaN      1030  41.375390  2.183092  \n",
       "2               1   NaN      1030  41.375404  2.183217  \n",
       "3               1   NaN      1030  41.375420  2.183354  \n",
       "4               1   NaN      1030  41.375425  2.183517  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 65010 entries, 0 to 78031\n",
      "Data columns (total 16 columns):\n",
      "ID_PRINCIP        65010 non-null int64\n",
      "ID_SECTOR         65010 non-null int64\n",
      "ID_GRUPACT        65010 non-null int64\n",
      "ID_ACT            65010 non-null int64\n",
      "N_LOCAL           65010 non-null object\n",
      "ID_MERCAT         65010 non-null int64\n",
      "N_CCOMERC         65010 non-null object\n",
      "N_CARRER          65010 non-null object\n",
      "NUM_POLICI        64996 non-null float64\n",
      "REF_CAD           65010 non-null object\n",
      "Codi_Barri        65010 non-null int64\n",
      "Codi_Districte    65010 non-null int64\n",
      "N_EIX             22844 non-null object\n",
      "SEC_CENS          65010 non-null int64\n",
      "LATITUD           65010 non-null float64\n",
      "LONGITUD          65010 non-null float64\n",
      "dtypes: float64(3), int64(8), object(5)\n",
      "memory usage: 8.4+ MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    60265\n",
       "0     4745\n",
       "Name: ID_PRINCIP, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['ID_PRINCIP'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[['ID_PRINCIP','N_LOCAL', 'N_CCOMERC', 'N_CARRER', 'REF_CAD','N_EIX']] = data[['ID_PRINCIP','N_LOCAL', 'N_CCOMERC', 'N_CARRER', 'REF_CAD','N_EIX']].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in ['ID_PRINCIP','N_LOCAL', 'N_CCOMERC', 'N_CARRER', 'REF_CAD','N_EIX']:\n",
    "    data[col] = data[col].cat.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 65010 entries, 0 to 78031\n",
      "Data columns (total 16 columns):\n",
      "ID_PRINCIP        65010 non-null int8\n",
      "ID_SECTOR         65010 non-null int64\n",
      "ID_GRUPACT        65010 non-null int64\n",
      "ID_ACT            65010 non-null int64\n",
      "N_LOCAL           65010 non-null int32\n",
      "ID_MERCAT         65010 non-null int64\n",
      "N_CCOMERC         65010 non-null int8\n",
      "N_CARRER          65010 non-null int16\n",
      "NUM_POLICI        64996 non-null float64\n",
      "REF_CAD           65010 non-null int32\n",
      "Codi_Barri        65010 non-null int64\n",
      "Codi_Districte    65010 non-null int64\n",
      "N_EIX             65010 non-null int8\n",
      "SEC_CENS          65010 non-null int64\n",
      "LATITUD           65010 non-null float64\n",
      "LONGITUD          65010 non-null float64\n",
      "dtypes: float64(3), int16(1), int32(2), int64(7), int8(3)\n",
      "memory usage: 6.3 MB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "647.7310180664062px",
    "left": "1298.6956787109375px",
    "right": "20px",
    "top": "120px",
    "width": "350px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}