Guest User

Untitled

a guest
Mar 5th, 2018
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.20 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "### Implementing logistic regression as a binary classifier of Graduation."
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 125,
  13. "metadata": {
  14. "collapsed": false
  15. },
  16. "outputs": [],
  17. "source": [
  18. "import pymysql\n",
  19. "import pandas as pd\n",
  20. "import numpy as np\n",
  21. "from sklearn import linear_model\n",
  22. "from sklearn import metrics\n",
  23. "from sklearn.cross_validation import train_test_split\n",
  24. "import sklearn\n",
  25. "from sklearn import preprocessing\n",
  26. "from sklearn import linear_model\n",
  27. "from sklearn.linear_model import LogisticRegression\n",
  28. "from sklearn.metrics import confusion_matrix\n",
  29. "from sklearn.metrics import r2_score"
  30. ]
  31. },
  32. {
  33. "cell_type": "markdown",
  34. "metadata": {},
  35. "source": [
  36. "### Making a connection with the local database through SQL"
  37. ]
  38. },
  39. {
  40. "cell_type": "code",
  41. "execution_count": 126,
  42. "metadata": {
  43. "collapsed": false
  44. },
  45. "outputs": [],
  46. "source": [
  47. "conn = pymysql.connect(host='localhost',\n",
  48. " user='root',\n",
  49. " password='root',\n",
  50. " db='shiree',\n",
  51. " charset='utf8mb4',\n",
  52. " cursorclass=pymysql.cursors.DictCursor)\n",
  53. "cursor = conn.cursor()"
  54. ]
  55. },
  56. {
  57. "cell_type": "markdown",
  58. "metadata": {},
  59. "source": [
  60. "### Using iga database"
  61. ]
  62. },
  63. {
  64. "cell_type": "code",
  65. "execution_count": 127,
  66. "metadata": {
  67. "collapsed": false
  68. },
  69. "outputs": [
  70. {
  71. "data": {
  72. "text/plain": [
  73. "0"
  74. ]
  75. },
  76. "execution_count": 127,
  77. "metadata": {},
  78. "output_type": "execute_result"
  79. }
  80. ],
  81. "source": [
  82. "cursor.execute(\"USE iga\")"
  83. ]
  84. },
  85. {
  86. "cell_type": "markdown",
  87. "metadata": {},
  88. "source": [
  89. "### Fetching all data from the table tbl_iga"
  90. ]
  91. },
  92. {
  93. "cell_type": "code",
  94. "execution_count": 128,
  95. "metadata": {
  96. "collapsed": false
  97. },
  98. "outputs": [
  99. {
  100. "data": {
  101. "text/plain": [
  102. "\"[{'HHID': '532670910101', 'LeadNGO_Code': '10', 'PNGO_Code': '', 'FirstAssetMonth': 'December', 'Fir\""
  103. ]
  104. },
  105. "execution_count": 128,
  106. "metadata": {},
  107. "output_type": "execute_result"
  108. }
  109. ],
  110. "source": [
  111. "cursor.execute('select * from tbl_iga');\n",
  112. "\n",
  113. "iga_table = cursor.fetchall()\n",
  114. "str(iga_table)[0:100]"
  115. ]
  116. },
  117. {
  118. "cell_type": "markdown",
  119. "metadata": {},
  120. "source": [
  121. "### Converting list of iga_table to pandas Data Frame"
  122. ]
  123. },
  124. {
  125. "cell_type": "code",
  126. "execution_count": 129,
  127. "metadata": {
  128. "collapsed": true
  129. },
  130. "outputs": [],
  131. "source": [
  132. "iga_dataFrame = pd.DataFrame(iga_table)"
  133. ]
  134. },
  135. {
  136. "cell_type": "markdown",
  137. "metadata": {},
  138. "source": [
  139. "### Using shiree database"
  140. ]
  141. },
  142. {
  143. "cell_type": "code",
  144. "execution_count": 131,
  145. "metadata": {
  146. "collapsed": false
  147. },
  148. "outputs": [
  149. {
  150. "data": {
  151. "text/plain": [
  152. "0"
  153. ]
  154. },
  155. "execution_count": 131,
  156. "metadata": {},
  157. "output_type": "execute_result"
  158. }
  159. ],
  160. "source": [
  161. "cursor.execute(\"USE shiree\")"
  162. ]
  163. },
  164. {
  165. "cell_type": "markdown",
  166. "metadata": {},
  167. "source": [
  168. "### Fetching all data from the Table reportdata"
  169. ]
  170. },
  171. {
  172. "cell_type": "code",
  173. "execution_count": 132,
  174. "metadata": {
  175. "collapsed": false
  176. },
  177. "outputs": [
  178. {
  179. "data": {
  180. "text/plain": [
  181. "\"[{'data_id': 1.0, 'startTime': datetime.datetime(2012, 11, 27, 22, 3, 33), 'endTime': datetime.datetime(2012, 11, 27, 22, 4, 13), 'image': None, 'isExtracted': b'\\\\x00', 'latitude': 23.8062219, 'longitude': 90.4189349, 'received': datetime.datetime(2012, 11, 27, 22, 4, 26), 'form_id': 1.0, 'ngo_id': \""
  182. ]
  183. },
  184. "execution_count": 132,
  185. "metadata": {},
  186. "output_type": "execute_result"
  187. }
  188. ],
  189. "source": [
  190. "cursor.execute('select * from reportdata');\n",
  191. "\n",
  192. "reportdata_table = cursor.fetchall()\n",
  193. "str(reportdata_table)[0:300]"
  194. ]
  195. },
  196. {
  197. "cell_type": "markdown",
  198. "metadata": {},
  199. "source": [
  200. "### Converting list of reportdata_table to pandas Data Frame"
  201. ]
  202. },
  203. {
  204. "cell_type": "code",
  205. "execution_count": 133,
  206. "metadata": {
  207. "collapsed": true
  208. },
  209. "outputs": [],
  210. "source": [
  211. "reportdata_dataFrame = pd.DataFrame(reportdata_table)"
  212. ]
  213. },
  214. {
  215. "cell_type": "markdown",
  216. "metadata": {},
  217. "source": [
  218. "### Renaming Column HH_ID to HHID"
  219. ]
  220. },
  221. {
  222. "cell_type": "code",
  223. "execution_count": 134,
  224. "metadata": {
  225. "collapsed": true
  226. },
  227. "outputs": [],
  228. "source": [
  229. "reportdata_dataFrame = reportdata_dataFrame.rename(columns = {'HH_ID':'HHID'})\n"
  230. ]
  231. },
  232. {
  233. "cell_type": "markdown",
  234. "metadata": {},
  235. "source": [
  236. "### Merging data frames reportdata_dataFrame and iga_dataFrame on the common column HHID"
  237. ]
  238. },
  239. {
  240. "cell_type": "code",
  241. "execution_count": 135,
  242. "metadata": {
  243. "collapsed": true
  244. },
  245. "outputs": [],
  246. "source": [
  247. "merged_dataFrame = pd.merge(reportdata_dataFrame,iga_dataFrame,on='HHID')"
  248. ]
  249. },
  250. {
  251. "cell_type": "markdown",
  252. "metadata": {},
  253. "source": [
  254. "### Total rows and columns in merged_dataFrame"
  255. ]
  256. },
  257. {
  258. "cell_type": "code",
  259. "execution_count": 136,
  260. "metadata": {
  261. "collapsed": false
  262. },
  263. "outputs": [
  264. {
  265. "name": "stdout",
  266. "output_type": "stream",
  267. "text": [
  268. "(8135, 107)\n"
  269. ]
  270. }
  271. ],
  272. "source": [
  273. "print(merged_dataFrame.shape)"
  274. ]
  275. },
  276. {
  277. "cell_type": "markdown",
  278. "metadata": {},
  279. "source": [
  280. "### Replacing None values with 0.0 in CompositeIndex and casting it to float\n"
  281. ]
  282. },
  283. {
  284. "cell_type": "code",
  285. "execution_count": 137,
  286. "metadata": {
  287. "collapsed": true
  288. },
  289. "outputs": [],
  290. "source": [
  291. "merged_dataFrame['CompositeIndex'] = merged_dataFrame['CompositeIndex'].fillna(0.0)\n",
  292. "merged_dataFrame.CompositeIndex = merged_dataFrame.CompositeIndex.astype(float)\n",
  293. "\n"
  294. ]
  295. },
  296. {
  297. "cell_type": "markdown",
  298. "metadata": {},
  299. "source": [
  300. "### Making a new column named Graduation: put 1 where the CompositeIndex value is greater than 2, otherwise put 0"
  301. ]
  302. },
  303. {
  304. "cell_type": "code",
  305. "execution_count": 138,
  306. "metadata": {
  307. "collapsed": false
  308. },
  309. "outputs": [],
  310. "source": [
  311. "merged_dataFrame['Graduation'] = np.where(merged_dataFrame['CompositeIndex']>2, 1, 0)\n"
  312. ]
  313. },
  314. {
  315. "cell_type": "markdown",
  316. "metadata": {},
  317. "source": [
  318. "### Total Info in merged_dataFrame"
  319. ]
  320. },
  321. {
  322. "cell_type": "code",
  323. "execution_count": 139,
  324. "metadata": {
  325. "collapsed": false
  326. },
  327. "outputs": [
  328. {
  329. "name": "stdout",
  330. "output_type": "stream",
  331. "text": [
  332. "<class 'pandas.core.frame.DataFrame'>\n",
  333. "Int64Index: 8135 entries, 0 to 8134\n",
  334. "Columns: 108 entries, Bad_Other_audio to Graduation\n",
  335. "dtypes: datetime64[ns](4), float64(10), int32(1), int64(4), object(89)\n",
  336. "memory usage: 6.7+ MB\n"
  337. ]
  338. }
  339. ],
  340. "source": [
  341. "merged_dataFrame.info()"
  342. ]
  343. },
  344. {
  345. "cell_type": "markdown",
  346. "metadata": {},
  347. "source": [
  348. "### Replacing all None values in economic status, confidence, happiness, and income sources with -1, and casting all of them to integers"
  349. ]
  350. },
  351. {
  352. "cell_type": "code",
  353. "execution_count": 140,
  354. "metadata": {
  355. "collapsed": false
  356. },
  357. "outputs": [],
  358. "source": [
  359. "merged_dataFrame['shp3_econstatusComp'] = merged_dataFrame['shp3_econstatusComp'].fillna(-1)\n",
  360. "merged_dataFrame['shp3_confidence'] = merged_dataFrame['shp3_confidence'].fillna(-1)\n",
  361. "merged_dataFrame['shp3_happiness'] = merged_dataFrame['shp3_happiness'].fillna(-1)\n",
  362. "merged_dataFrame['shp3_incomeSources'] = merged_dataFrame['shp3_incomeSources'].fillna(-1)\n",
  363. "merged_dataFrame.shp3_econstatusComp = merged_dataFrame.shp3_econstatusComp.astype(int)\n",
  364. "merged_dataFrame.shp3_incomeSources = merged_dataFrame.shp3_incomeSources.astype(int)\n",
  365. "merged_dataFrame.shp3_confidence = merged_dataFrame.shp3_confidence.astype(int)\n",
  366. "merged_dataFrame.shp3_happiness = merged_dataFrame.shp3_happiness.astype(int)"
  367. ]
  368. },
  369. {
  370. "cell_type": "markdown",
  371. "metadata": {},
  372. "source": [
  373. "### Making a data frame using the economic status, confidence, happiness, income sources, and Graduation columns from merged_dataFrame"
  374. ]
  375. },
  376. {
  377. "cell_type": "code",
  378. "execution_count": 141,
  379. "metadata": {
  380. "collapsed": true
  381. },
  382. "outputs": [],
  383. "source": [
  384. "feature_df = merged_dataFrame[['shp3_econstatusComp','shp3_confidence','shp3_happiness','shp3_incomeSources','Graduation']]"
  385. ]
  386. },
  387. {
  388. "cell_type": "markdown",
  389. "metadata": {},
  390. "source": [
  391. "### Making a numpy array matrix from the feature_df data frame; ncol is the number of columns in the matrix"
  392. ]
  393. },
  394. {
  395. "cell_type": "code",
  396. "execution_count": 142,
  397. "metadata": {
  398. "collapsed": true
  399. },
  400. "outputs": [],
  401. "source": [
  402. "fX = np.array(feature_df)\n",
  403. "ncol = fX.shape[1]"
  404. ]
  405. },
  406. {
  407. "cell_type": "markdown",
  408. "metadata": {},
  409. "source": [
  410. "### Balancing the data. The model would be imbalanced, as there are thousands more rows where Graduation is 0 than where it is 1, so we take all the rows where Graduation is 1 and only 1000 of the ~7000 rows where Graduation is 0."
  411. ]
  412. },
  413. {
  414. "cell_type": "code",
  415. "execution_count": 143,
  416. "metadata": {
  417. "collapsed": false
  418. },
  419. "outputs": [
  420. {
  421. "name": "stdout",
  422. "output_type": "stream",
  423. "text": [
  424. "(624, 5)\n",
  425. "(1000, 5)\n",
  426. "(1624, 5)\n"
  427. ]
  428. }
  429. ],
  430. "source": [
  431. "fX_1 = fX[fX[0: ,-1] == 1.0]\n",
  432. "print(fX_1.shape)\n",
  433. "\n",
  434. "fX_0 = fX[fX[0: , -1] == 0.0]\n",
  435. "fX_0 = fX_0[:1000]\n",
  436. "\n",
  437. "print(fX_0.shape)\n",
  438. "\n",
  439. "fX = np.concatenate((fX_1,fX_0),axis=0)\n",
  440. "\n",
  441. "print(fX.shape)"
  442. ]
  443. },
  444. {
  445. "cell_type": "markdown",
  446. "metadata": {},
  447. "source": [
  448. "### Assigning X all the rows and every column except the last of the fX matrix as the feature values. Assigning y only the last column of the matrix fX as the target value."
  449. ]
  450. },
  451. {
  452. "cell_type": "code",
  453. "execution_count": 144,
  454. "metadata": {
  455. "collapsed": true
  456. },
  457. "outputs": [],
  458. "source": [
  459. "X = fX[0:, 0:(ncol - 1)]\n",
  460. "y = fX[ 0:,-1]"
  461. ]
  462. },
  463. {
  464. "cell_type": "markdown",
  465. "metadata": {},
  466. "source": [
  467. "### We can see that our model would not be biased, as the target value y is balanced: the number of 1s in y is 624 and the number of 0s is 1000."
  468. ]
  469. },
  470. {
  471. "cell_type": "code",
  472. "execution_count": 145,
  473. "metadata": {
  474. "collapsed": false
  475. },
  476. "outputs": [
  477. {
  478. "name": "stdout",
  479. "output_type": "stream",
  480. "text": [
  481. "Total one in y is (624,), Total zero in y (1000,)\n"
  482. ]
  483. }
  484. ],
  485. "source": [
  486. "print(\"Total one in y is {}, Total zero in y {}\".format(y[y==1.0].shape,y[y==0.0].shape))"
  487. ]
  488. },
  489. {
  490. "cell_type": "markdown",
  491. "metadata": {},
  492. "source": [
  493. "### Splitting X and y into a training set and a test set."
  494. ]
  495. },
  496. {
  497. "cell_type": "code",
  498. "execution_count": 146,
  499. "metadata": {
  500. "collapsed": true
  501. },
  502. "outputs": [],
  503. "source": [
  504. "X_train, X_test,y_train,y_test = train_test_split(X,y,train_size=0.7)"
  505. ]
  506. },
  507. {
  508. "cell_type": "markdown",
  509. "metadata": {},
  510. "source": [
  511. "### Making a Logistic Regression classifier and fitting it on the training data. NOTE(review): the code cell below evaluates `lr`, which is never defined anywhere in this notebook — a cell that creates and fits the model (e.g. `lr = LogisticRegression()` followed by `lr.fit(X_train, y_train)`) appears to be missing from this paste; confirm against the original notebook."
  512. ]
  513. },
  514. {
  515. "cell_type": "code",
  516. "execution_count": 147,
  517. "metadata": {
  518. "collapsed": false
  519. },
  520. "outputs": [
  521. {
  522. "name": "stdout",
  523. "output_type": "stream",
  524. "text": [
  525. "Logistic regression Train Accuracy :: 0.954225352113\n",
  526. "Logistic regression Test Accuracy :: 0.963114754098\n"
  527. ]
  528. }
  529. ],
  530. "source": [
  531. "print(\"Logistic regression Train Accuracy :: \", metrics.accuracy_score(y_train, lr.predict(X_train)))\n",
  532. "print (\"Logistic regression Test Accuracy :: \", metrics.accuracy_score(y_test, lr.predict(X_test)))\n"
  533. ]
  534. },
  535. {
  536. "cell_type": "markdown",
  537. "metadata": {},
  538. "source": [
  539. "### Printing the confusion matrix of the test set."
  540. ]
  541. },
  542. {
  543. "cell_type": "code",
  544. "execution_count": 148,
  545. "metadata": {
  546. "collapsed": false
  547. },
  548. "outputs": [
  549. {
  550. "name": "stdout",
  551. "output_type": "stream",
  552. "text": [
  553. "[[281 12]\n",
  554. " [ 6 189]]\n"
  555. ]
  556. }
  557. ],
  558. "source": [
  559. "y_predict = lr.predict(X_test)\n",
  560. "print(confusion_matrix(y_test,y_predict ))"
  561. ]
  562. },
  563. {
  564. "cell_type": "code",
  565. "execution_count": null,
  566. "metadata": {
  567. "collapsed": true
  568. },
  569. "outputs": [],
  570. "source": []
  571. }
  572. ],
  573. "metadata": {
  574. "kernelspec": {
  575. "display_name": "Python 3",
  576. "language": "python",
  577. "name": "python3"
  578. },
  579. "language_info": {
  580. "codemirror_mode": {
  581. "name": "ipython",
  582. "version": 3
  583. },
  584. "file_extension": ".py",
  585. "mimetype": "text/x-python",
  586. "name": "python",
  587. "nbconvert_exporter": "python",
  588. "pygments_lexer": "ipython3",
  589. "version": "3.6.0"
  590. }
  591. },
  592. "nbformat": 4,
  593. "nbformat_minor": 2
  594. }
Add Comment
Please, Sign In to add comment