Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Import libraries"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "data = pd.read_csv('gapminder.csv', low_memory=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Explore the dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>country</th>\n",
- " <th>incomeperperson</th>\n",
- " <th>alcconsumption</th>\n",
- " <th>armedforcesrate</th>\n",
- " <th>breastcancerper100th</th>\n",
- " <th>co2emissions</th>\n",
- " <th>femaleemployrate</th>\n",
- " <th>hivrate</th>\n",
- " <th>internetuserate</th>\n",
- " <th>lifeexpectancy</th>\n",
- " <th>oilperperson</th>\n",
- " <th>polityscore</th>\n",
- " <th>relectricperperson</th>\n",
- " <th>suicideper100th</th>\n",
- " <th>employrate</th>\n",
- " <th>urbanrate</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Afghanistan</td>\n",
- " <td></td>\n",
- " <td>.03</td>\n",
- " <td>.5696534</td>\n",
- " <td>26.8</td>\n",
- " <td>75944000</td>\n",
- " <td>25.6000003814697</td>\n",
- " <td></td>\n",
- " <td>3.65412162280064</td>\n",
- " <td>48.673</td>\n",
- " <td></td>\n",
- " <td>0</td>\n",
- " <td></td>\n",
- " <td>6.68438529968262</td>\n",
- " <td>55.7000007629394</td>\n",
- " <td>24.04</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Albania</td>\n",
- " <td>1914.99655094922</td>\n",
- " <td>7.29</td>\n",
- " <td>1.0247361</td>\n",
- " <td>57.4</td>\n",
- " <td>223747333.333333</td>\n",
- " <td>42.0999984741211</td>\n",
- " <td></td>\n",
- " <td>44.9899469578783</td>\n",
- " <td>76.918</td>\n",
- " <td></td>\n",
- " <td>9</td>\n",
- " <td>636.341383366604</td>\n",
- " <td>7.69932985305786</td>\n",
- " <td>51.4000015258789</td>\n",
- " <td>46.72</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Algeria</td>\n",
- " <td>2231.99333515006</td>\n",
- " <td>.69</td>\n",
- " <td>2.306817</td>\n",
- " <td>23.5</td>\n",
- " <td>2932108666.66667</td>\n",
- " <td>31.7000007629394</td>\n",
- " <td>.1</td>\n",
- " <td>12.5000733055148</td>\n",
- " <td>73.131</td>\n",
- " <td>.42009452521537</td>\n",
- " <td>2</td>\n",
- " <td>590.509814347428</td>\n",
- " <td>4.8487696647644</td>\n",
- " <td>50.5</td>\n",
- " <td>65.22</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Andorra</td>\n",
- " <td>21943.3398976022</td>\n",
- " <td>10.17</td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td>81</td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td></td>\n",
- " <td>5.36217880249023</td>\n",
- " <td></td>\n",
- " <td>88.92</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Angola</td>\n",
- " <td>1381.00426770244</td>\n",
- " <td>5.57</td>\n",
- " <td>1.4613288</td>\n",
- " <td>23.1</td>\n",
- " <td>248358000</td>\n",
- " <td>69.4000015258789</td>\n",
- " <td>2</td>\n",
- " <td>9.99995388324075</td>\n",
- " <td>51.093</td>\n",
- " <td></td>\n",
- " <td>-2</td>\n",
- " <td>172.999227388199</td>\n",
- " <td>14.5546770095825</td>\n",
- " <td>75.6999969482422</td>\n",
- " <td>56.7</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " country incomeperperson alcconsumption armedforcesrate \\\n",
- "0 Afghanistan .03 .5696534 \n",
- "1 Albania 1914.99655094922 7.29 1.0247361 \n",
- "2 Algeria 2231.99333515006 .69 2.306817 \n",
- "3 Andorra 21943.3398976022 10.17 \n",
- "4 Angola 1381.00426770244 5.57 1.4613288 \n",
- "\n",
- " breastcancerper100th co2emissions femaleemployrate hivrate \\\n",
- "0 26.8 75944000 25.6000003814697 \n",
- "1 57.4 223747333.333333 42.0999984741211 \n",
- "2 23.5 2932108666.66667 31.7000007629394 .1 \n",
- "3 \n",
- "4 23.1 248358000 69.4000015258789 2 \n",
- "\n",
- " internetuserate lifeexpectancy oilperperson polityscore \\\n",
- "0 3.65412162280064 48.673 0 \n",
- "1 44.9899469578783 76.918 9 \n",
- "2 12.5000733055148 73.131 .42009452521537 2 \n",
- "3 81 \n",
- "4 9.99995388324075 51.093 -2 \n",
- "\n",
- " relectricperperson suicideper100th employrate urbanrate \n",
- "0 6.68438529968262 55.7000007629394 24.04 \n",
- "1 636.341383366604 7.69932985305786 51.4000015258789 46.72 \n",
- "2 590.509814347428 4.8487696647644 50.5 65.22 \n",
- "3 5.36217880249023 88.92 \n",
- "4 172.999227388199 14.5546770095825 75.6999969482422 56.7 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "data.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(213, 16)\n",
- "213\n",
- "16\n"
- ]
- }
- ],
- "source": [
- "# Report the frame's size: the (rows, columns) tuple first, then each count on its own line.\n",
- "print(data.shape)     # dimension of dataframe\n",
- "print(data.shape[0])  # number of observations (rows)\n",
- "print(data.shape[1])  # number of variables (columns)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['country', 'incomeperperson', 'alcconsumption', 'armedforcesrate',\n",
- " 'breastcancerper100th', 'co2emissions', 'femaleemployrate', 'hivrate',\n",
- " 'internetuserate', 'lifeexpectancy', 'oilperperson', 'polityscore',\n",
- " 'relectricperperson', 'suicideper100th', 'employrate', 'urbanrate'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Column names\n",
- "data.columns"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Setting variables you will be working with to numeric"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Convert the four variables used in this analysis to numeric dtype.\n",
- "# errors='coerce' turns any entry that cannot be parsed as a number into NaN.\n",
- "for col in ('suicideper100th', 'incomeperperson', 'internetuserate', 'urbanrate'):\n",
- "    data[col] = pd.to_numeric(data[col], errors='coerce')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "<class 'pandas.core.frame.DataFrame'>\n",
- "RangeIndex: 213 entries, 0 to 212\n",
- "Data columns (total 16 columns):\n",
- "country 213 non-null object\n",
- "incomeperperson 190 non-null float64\n",
- "alcconsumption 213 non-null object\n",
- "armedforcesrate 213 non-null object\n",
- "breastcancerper100th 213 non-null object\n",
- "co2emissions 213 non-null object\n",
- "femaleemployrate 213 non-null object\n",
- "hivrate 213 non-null object\n",
- "internetuserate 192 non-null float64\n",
- "lifeexpectancy 213 non-null object\n",
- "oilperperson 213 non-null object\n",
- "polityscore 213 non-null object\n",
- "relectricperperson 213 non-null object\n",
- "suicideper100th 191 non-null float64\n",
- "employrate 213 non-null object\n",
- "urbanrate 203 non-null float64\n",
- "dtypes: float64(4), object(12)\n",
- "memory usage: 26.7+ KB\n"
- ]
- }
- ],
- "source": [
- "# Check type of data after converting\n",
- "data.info()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Make subset of the data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Independent copy holding only the country label and the four variables of interest,\n",
- "# so later edits to sub_data never write back into `data`.\n",
- "sub_data = data.loc[:, ['country', 'suicideper100th', 'incomeperperson', 'internetuserate', 'urbanrate']].copy()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>country</th>\n",
- " <th>suicideper100th</th>\n",
- " <th>incomeperperson</th>\n",
- " <th>internetuserate</th>\n",
- " <th>urbanrate</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>106</th>\n",
- " <td>Lesotho</td>\n",
- " <td>7.858619</td>\n",
- " <td>495.734247</td>\n",
- " <td>3.860565</td>\n",
- " <td>25.46</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>92</th>\n",
- " <td>Italy</td>\n",
- " <td>4.930045</td>\n",
- " <td>18982.269285</td>\n",
- " <td>53.740217</td>\n",
- " <td>68.08</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>131</th>\n",
- " <td>Mozambique</td>\n",
- " <td>10.550375</td>\n",
- " <td>389.763634</td>\n",
- " <td>4.170136</td>\n",
- " <td>36.84</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>23</th>\n",
- " <td>Bosnia and Herzegovina</td>\n",
- " <td>11.836716</td>\n",
- " <td>2183.344867</td>\n",
- " <td>52.002061</td>\n",
- " <td>47.44</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>199</th>\n",
- " <td>Uganda</td>\n",
- " <td>12.289122</td>\n",
- " <td>377.421113</td>\n",
- " <td>12.500255</td>\n",
- " <td>12.98</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>122</th>\n",
- " <td>Mauritania</td>\n",
- " <td>6.882952</td>\n",
- " <td>609.131206</td>\n",
- " <td>2.999803</td>\n",
- " <td>41.00</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>68</th>\n",
- " <td>Georgia</td>\n",
- " <td>1.574350</td>\n",
- " <td>1258.762596</td>\n",
- " <td>26.297251</td>\n",
- " <td>52.74</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>21</th>\n",
- " <td>Bhutan</td>\n",
- " <td>15.542603</td>\n",
- " <td>1324.194906</td>\n",
- " <td>13.598876</td>\n",
- " <td>34.48</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>133</th>\n",
- " <td>Namibia</td>\n",
- " <td>8.021970</td>\n",
- " <td>2667.246710</td>\n",
- " <td>6.500823</td>\n",
- " <td>36.84</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>30</th>\n",
- " <td>Cambodia</td>\n",
- " <td>4.961071</td>\n",
- " <td>557.947513</td>\n",
- " <td>1.259934</td>\n",
- " <td>21.56</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " country suicideper100th incomeperperson \\\n",
- "106 Lesotho 7.858619 495.734247 \n",
- "92 Italy 4.930045 18982.269285 \n",
- "131 Mozambique 10.550375 389.763634 \n",
- "23 Bosnia and Herzegovina 11.836716 2183.344867 \n",
- "199 Uganda 12.289122 377.421113 \n",
- "122 Mauritania 6.882952 609.131206 \n",
- "68 Georgia 1.574350 1258.762596 \n",
- "21 Bhutan 15.542603 1324.194906 \n",
- "133 Namibia 8.021970 2667.246710 \n",
- "30 Cambodia 4.961071 557.947513 \n",
- "\n",
- " internetuserate urbanrate \n",
- "106 3.860565 25.46 \n",
- "92 53.740217 68.08 \n",
- "131 4.170136 36.84 \n",
- "23 52.002061 47.44 \n",
- "199 12.500255 12.98 \n",
- "122 2.999803 41.00 \n",
- "68 26.297251 52.74 \n",
- "21 13.598876 34.48 \n",
- "133 6.500823 36.84 \n",
- "30 1.259934 21.56 "
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Fixed seed so the same ten rows are shown on every Restart & Run All.\n",
- "sub_data.sample(10, random_state=0)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Check for missing values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "country 0\n",
- "suicideper100th 22\n",
- "incomeperperson 23\n",
- "internetuserate 21\n",
- "urbanrate 10\n",
- "dtype: int64"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sub_data.isnull().sum()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(213, 5)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sub_data.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Group the quantities into appropriate bins"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>suicideper100th</th>\n",
- " <th>incomeperperson</th>\n",
- " <th>internetuserate</th>\n",
- " <th>urbanrate</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>count</th>\n",
- " <td>191.000000</td>\n",
- " <td>190.000000</td>\n",
- " <td>192.000000</td>\n",
- " <td>203.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>mean</th>\n",
- " <td>9.640839</td>\n",
- " <td>8740.966076</td>\n",
- " <td>35.632716</td>\n",
- " <td>56.769360</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>std</th>\n",
- " <td>6.300178</td>\n",
- " <td>14262.809083</td>\n",
- " <td>27.780285</td>\n",
- " <td>23.844933</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>min</th>\n",
- " <td>0.201449</td>\n",
- " <td>103.775857</td>\n",
- " <td>0.210066</td>\n",
- " <td>10.400000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>25%</th>\n",
- " <td>4.988449</td>\n",
- " <td>748.245151</td>\n",
- " <td>9.999604</td>\n",
- " <td>36.830000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>50%</th>\n",
- " <td>8.262893</td>\n",
- " <td>2553.496056</td>\n",
- " <td>31.810121</td>\n",
- " <td>57.940000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>75%</th>\n",
- " <td>12.328551</td>\n",
- " <td>9379.891165</td>\n",
- " <td>56.416046</td>\n",
- " <td>74.210000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>max</th>\n",
- " <td>35.752872</td>\n",
- " <td>105147.437697</td>\n",
- " <td>95.638113</td>\n",
- " <td>100.000000</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " suicideper100th incomeperperson internetuserate urbanrate\n",
- "count 191.000000 190.000000 192.000000 203.000000\n",
- "mean 9.640839 8740.966076 35.632716 56.769360\n",
- "std 6.300178 14262.809083 27.780285 23.844933\n",
- "min 0.201449 103.775857 0.210066 10.400000\n",
- "25% 4.988449 748.245151 9.999604 36.830000\n",
- "50% 8.262893 2553.496056 31.810121 57.940000\n",
- "75% 12.328551 9379.891165 56.416046 74.210000\n",
- "max 35.752872 105147.437697 95.638113 100.000000"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sub_data.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Right-closed bin edges per variable: code k (1-based) covers (edges[k-1], edges[k]].\n",
- "# Same thresholds as before: steps of 5 for the suicide rate (top edge 40), steps of 1000\n",
- "# for income (open-ended above 15000) and steps of 10 for the two percentage rates\n",
- "# (open-ended above 110).\n",
- "BIN_EDGES = {\n",
- "    'suicideper100th': [-np.inf] + list(range(5, 45, 5)),\n",
- "    'incomeperperson': [-np.inf] + list(range(1000, 16000, 1000)) + [np.inf],\n",
- "    'internetuserate': [-np.inf] + list(range(10, 120, 10)) + [np.inf],\n",
- "    'urbanrate': [-np.inf] + list(range(10, 120, 10)) + [np.inf],\n",
- "}\n",
- "\n",
- "# Bin from the numeric columns in `data`, not from sub_data itself: the previous in-place\n",
- "# version re-binned its own codes whenever the cell was run a second time.\n",
- "# pd.cut returns 0-based bin indices (NaN for missing values and for anything outside the\n",
- "# edges, e.g. a suicide rate above 40); adding 1 gives the 1-based codes used below.\n",
- "for col, edges in BIN_EDGES.items():\n",
- "    sub_data[col] = pd.cut(data[col], bins=edges, labels=False) + 1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Counts and percentages (i.e. frequency distributions) for each variable"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>country</th>\n",
- " <th>suicideper100th</th>\n",
- " <th>incomeperperson</th>\n",
- " <th>internetuserate</th>\n",
- " <th>urbanrate</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>47</th>\n",
- " <td>Cuba</td>\n",
- " <td>3.0</td>\n",
- " <td>5.0</td>\n",
- " <td>2.0</td>\n",
- " <td>8.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>119</th>\n",
- " <td>Malta</td>\n",
- " <td>1.0</td>\n",
- " <td>12.0</td>\n",
- " <td>7.0</td>\n",
- " <td>10.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>171</th>\n",
- " <td>Seychelles</td>\n",
- " <td>2.0</td>\n",
- " <td>9.0</td>\n",
- " <td>5.0</td>\n",
- " <td>6.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>142</th>\n",
- " <td>Nigeria</td>\n",
- " <td>2.0</td>\n",
- " <td>1.0</td>\n",
- " <td>3.0</td>\n",
- " <td>5.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>16</th>\n",
- " <td>Belarus</td>\n",
- " <td>6.0</td>\n",
- " <td>3.0</td>\n",
- " <td>4.0</td>\n",
- " <td>8.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>61</th>\n",
- " <td>Faeroe Islands</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>8.0</td>\n",
- " <td>5.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>68</th>\n",
- " <td>Georgia</td>\n",
- " <td>1.0</td>\n",
- " <td>2.0</td>\n",
- " <td>3.0</td>\n",
- " <td>6.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>Antigua and Barbuda</td>\n",
- " <td>1.0</td>\n",
- " <td>12.0</td>\n",
- " <td>9.0</td>\n",
- " <td>4.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>117</th>\n",
- " <td>Maldives</td>\n",
- " <td>5.0</td>\n",
- " <td>5.0</td>\n",
- " <td>3.0</td>\n",
- " <td>4.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>110</th>\n",
- " <td>Lithuania</td>\n",
- " <td>7.0</td>\n",
- " <td>6.0</td>\n",
- " <td>7.0</td>\n",
- " <td>7.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>85</th>\n",
- " <td>Iceland</td>\n",
- " <td>3.0</td>\n",
- " <td>16.0</td>\n",
- " <td>10.0</td>\n",
- " <td>10.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>72</th>\n",
- " <td>Greece</td>\n",
- " <td>1.0</td>\n",
- " <td>14.0</td>\n",
- " <td>5.0</td>\n",
- " <td>7.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>94</th>\n",
- " <td>Japan</td>\n",
- " <td>4.0</td>\n",
- " <td>16.0</td>\n",
- " <td>8.0</td>\n",
- " <td>7.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>21</th>\n",
- " <td>Bhutan</td>\n",
- " <td>4.0</td>\n",
- " <td>2.0</td>\n",
- " <td>2.0</td>\n",
- " <td>4.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>137</th>\n",
- " <td>Netherlands Antilles</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>10.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>139</th>\n",
- " <td>New Zealand</td>\n",
- " <td>3.0</td>\n",
- " <td>15.0</td>\n",
- " <td>9.0</td>\n",
- " <td>9.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>88</th>\n",
- " <td>Iran</td>\n",
- " <td>2.0</td>\n",
- " <td>3.0</td>\n",
- " <td>2.0</td>\n",
- " <td>7.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>60</th>\n",
- " <td>Ethiopia</td>\n",
- " <td>3.0</td>\n",
- " <td>1.0</td>\n",
- " <td>1.0</td>\n",
- " <td>2.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>13</th>\n",
- " <td>Bahrain</td>\n",
- " <td>1.0</td>\n",
- " <td>13.0</td>\n",
- " <td>6.0</td>\n",
- " <td>9.0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>116</th>\n",
- " <td>Malaysia</td>\n",
- " <td>2.0</td>\n",
- " <td>6.0</td>\n",
- " <td>6.0</td>\n",
- " <td>8.0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " country suicideper100th incomeperperson internetuserate \\\n",
- "47 Cuba 3.0 5.0 2.0 \n",
- "119 Malta 1.0 12.0 7.0 \n",
- "171 Seychelles 2.0 9.0 5.0 \n",
- "142 Nigeria 2.0 1.0 3.0 \n",
- "16 Belarus 6.0 3.0 4.0 \n",
- "61 Faeroe Islands NaN NaN 8.0 \n",
- "68 Georgia 1.0 2.0 3.0 \n",
- "5 Antigua and Barbuda 1.0 12.0 9.0 \n",
- "117 Maldives 5.0 5.0 3.0 \n",
- "110 Lithuania 7.0 6.0 7.0 \n",
- "85 Iceland 3.0 16.0 10.0 \n",
- "72 Greece 1.0 14.0 5.0 \n",
- "94 Japan 4.0 16.0 8.0 \n",
- "21 Bhutan 4.0 2.0 2.0 \n",
- "137 Netherlands Antilles NaN NaN NaN \n",
- "139 New Zealand 3.0 15.0 9.0 \n",
- "88 Iran 2.0 3.0 2.0 \n",
- "60 Ethiopia 3.0 1.0 1.0 \n",
- "13 Bahrain 1.0 13.0 6.0 \n",
- "116 Malaysia 2.0 6.0 6.0 \n",
- "\n",
- " urbanrate \n",
- "47 8.0 \n",
- "119 10.0 \n",
- "171 6.0 \n",
- "142 5.0 \n",
- "16 8.0 \n",
- "61 5.0 \n",
- "68 6.0 \n",
- "5 4.0 \n",
- "117 4.0 \n",
- "110 7.0 \n",
- "85 10.0 \n",
- "72 7.0 \n",
- "94 7.0 \n",
- "21 4.0 \n",
- "137 10.0 \n",
- "139 9.0 \n",
- "88 7.0 \n",
- "60 2.0 \n",
- "13 9.0 \n",
- "116 8.0 "
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Fixed seed so the same twenty rows are shown on every Restart & Run All.\n",
- "sub_data.sample(20, random_state=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "counts for suicideper100th\n",
- " 2.0 65\n",
- " 3.0 51\n",
- " 1.0 49\n",
- "NaN 22\n",
- " 4.0 12\n",
- " 5.0 6\n",
- " 6.0 6\n",
- " 7.0 1\n",
- " 8.0 1\n",
- "Name: suicideper100th, dtype: int64\n",
- "percentages for suicideper100th\n",
- " 2.0 0.305164\n",
- " 3.0 0.239437\n",
- " 1.0 0.230047\n",
- "NaN 0.103286\n",
- " 4.0 0.056338\n",
- " 5.0 0.028169\n",
- " 6.0 0.028169\n",
- " 7.0 0.004695\n",
- " 8.0 0.004695\n",
- "Name: suicideper100th, dtype: float64\n"
- ]
- }
- ],
- "source": [
- "# Frequency table of the binned suicide-rate codes; dropna=False keeps NaN as its own row.\n",
- "c1 = sub_data['suicideper100th'].value_counts(dropna=False)\n",
- "print('counts for suicideper100th')\n",
- "print(c1)\n",
- "\n",
- "# Same table expressed as a share of all rows (normalize=True).\n",
- "p1 = sub_data['suicideper100th'].value_counts(dropna=False, normalize=True)\n",
- "print('percentages for suicideper100th')\n",
- "print(p1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "counts for incomeperperson\n",
- " 1.0 54\n",
- " 16.0 37\n",
- " 2.0 26\n",
- "NaN 23\n",
- " 3.0 22\n",
- " 6.0 11\n",
- " 7.0 8\n",
- " 5.0 7\n",
- " 4.0 6\n",
- " 12.0 4\n",
- " 10.0 4\n",
- " 9.0 3\n",
- " 11.0 2\n",
- " 13.0 2\n",
- " 8.0 2\n",
- " 14.0 1\n",
- " 15.0 1\n",
- "Name: incomeperperson, dtype: int64\n",
- "percentages for incomeperperson\n",
- " 1.0 0.253521\n",
- " 16.0 0.173709\n",
- " 2.0 0.122066\n",
- "NaN 0.107981\n",
- " 3.0 0.103286\n",
- " 6.0 0.051643\n",
- " 7.0 0.037559\n",
- " 5.0 0.032864\n",
- " 4.0 0.028169\n",
- " 12.0 0.018779\n",
- " 10.0 0.018779\n",
- " 9.0 0.014085\n",
- " 11.0 0.009390\n",
- " 13.0 0.009390\n",
- " 8.0 0.009390\n",
- " 14.0 0.004695\n",
- " 15.0 0.004695\n",
- "Name: incomeperperson, dtype: float64\n"
- ]
- }
- ],
- "source": [
- "# Frequency table of the binned income codes; dropna=False keeps NaN as its own row.\n",
- "c2 = sub_data['incomeperperson'].value_counts(dropna=False)\n",
- "print('counts for incomeperperson')\n",
- "print(c2)\n",
- "\n",
- "# Same table expressed as a share of all rows (normalize=True).\n",
- "p2 = sub_data['incomeperperson'].value_counts(dropna=False, normalize=True)\n",
- "print('percentages for incomeperperson')\n",
- "print(p2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "counts for internetuserate\n",
- " 1.0 49\n",
- " 2.0 27\n",
- " 5.0 25\n",
- "NaN 21\n",
- " 4.0 18\n",
- " 3.0 17\n",
- " 8.0 16\n",
- " 7.0 14\n",
- " 9.0 12\n",
- " 6.0 9\n",
- " 10.0 5\n",
- "Name: internetuserate, dtype: int64\n",
- "percentages for internetuserate\n",
- " 1.0 0.230047\n",
- " 2.0 0.126761\n",
- " 5.0 0.117371\n",
- "NaN 0.098592\n",
- " 4.0 0.084507\n",
- " 3.0 0.079812\n",
- " 8.0 0.075117\n",
- " 7.0 0.065728\n",
- " 9.0 0.056338\n",
- " 6.0 0.042254\n",
- " 10.0 0.023474\n",
- "Name: internetuserate, dtype: float64\n"
- ]
- }
- ],
- "source": [
- "# Frequency table of the binned internet-use codes; dropna=False keeps NaN as its own row.\n",
- "c3 = sub_data['internetuserate'].value_counts(dropna=False)\n",
- "print('counts for internetuserate')\n",
- "print(c3)\n",
- "\n",
- "# Same table expressed as a share of all rows (normalize=True).\n",
- "p3 = sub_data['internetuserate'].value_counts(dropna=False, normalize=True)\n",
- "print('percentages for internetuserate')\n",
- "print(p3)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "counts for urbanrate\n",
- " 7.0 34\n",
- " 8.0 24\n",
- " 4.0 24\n",
- " 6.0 24\n",
- " 5.0 22\n",
- " 3.0 22\n",
- " 9.0 21\n",
- " 10.0 19\n",
- " 2.0 13\n",
- "NaN 10\n",
- "Name: urbanrate, dtype: int64\n",
- "percentages for urbanrate\n",
- " 7.0 0.159624\n",
- " 8.0 0.112676\n",
- " 4.0 0.112676\n",
- " 6.0 0.112676\n",
- " 5.0 0.103286\n",
- " 3.0 0.103286\n",
- " 9.0 0.098592\n",
- " 10.0 0.089202\n",
- " 2.0 0.061033\n",
- "NaN 0.046948\n",
- "Name: urbanrate, dtype: float64\n"
- ]
- }
- ],
- "source": [
- "# Frequency table of the binned urban-rate codes; dropna=False keeps NaN as its own row.\n",
- "c4 = sub_data['urbanrate'].value_counts(dropna=False)\n",
- "print('counts for urbanrate')\n",
- "print(c4)\n",
- "\n",
- "# Same table expressed as a share of all rows (normalize=True).\n",
- "p4 = sub_data['urbanrate'].value_counts(dropna=False, normalize=True)\n",
- "print('percentages for urbanrate')\n",
- "print(p4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>suicideper100th</th>\n",
- " <th>incomeperperson</th>\n",
- " <th>internetuserate</th>\n",
- " <th>urbanrate</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>count</th>\n",
- " <td>191.000000</td>\n",
- " <td>190.000000</td>\n",
- " <td>192.000000</td>\n",
- " <td>203.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>mean</th>\n",
- " <td>2.413613</td>\n",
- " <td>6.068421</td>\n",
- " <td>4.109375</td>\n",
- " <td>6.162562</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>std</th>\n",
- " <td>1.310533</td>\n",
- " <td>5.745074</td>\n",
- " <td>2.789944</td>\n",
- " <td>2.377322</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>min</th>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>2.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>25%</th>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>4.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>50%</th>\n",
- " <td>2.000000</td>\n",
- " <td>3.000000</td>\n",
- " <td>4.000000</td>\n",
- " <td>6.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>75%</th>\n",
- " <td>3.000000</td>\n",
- " <td>10.000000</td>\n",
- " <td>6.000000</td>\n",
- " <td>8.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>max</th>\n",
- " <td>8.000000</td>\n",
- " <td>16.000000</td>\n",
- " <td>10.000000</td>\n",
- " <td>10.000000</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " suicideper100th incomeperperson internetuserate urbanrate\n",
- "count 191.000000 190.000000 192.000000 203.000000\n",
- "mean 2.413613 6.068421 4.109375 6.162562\n",
- "std 1.310533 5.745074 2.789944 2.377322\n",
- "min 1.000000 1.000000 1.000000 2.000000\n",
- "25% 1.000000 1.000000 1.000000 4.000000\n",
- "50% 2.000000 3.000000 4.000000 6.000000\n",
- "75% 3.000000 10.000000 6.000000 8.000000\n",
- "max 8.000000 16.000000 10.000000 10.000000"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "sub_data.describe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment