Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Implementing logistic regression as a binary classifier for Graduation."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 125,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "import pymysql\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "from sklearn import linear_model\n",
- "from sklearn import metrics\n",
- "from sklearn.cross_validation import train_test_split\n",
- "import sklearn\n",
- "from sklearn import preprocessing\n",
- "from sklearn import linear_model\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "from sklearn.metrics import confusion_matrix\n",
- "from sklearn.metrics import r2_score"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Making a connection to the local database through SQL"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 126,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Connection settings are read from the environment so real credentials stay\n",
- "# out of the notebook; the fallbacks are the original local-development values.\n",
- "conn = pymysql.connect(host=os.environ.get('SHIREE_DB_HOST', 'localhost'),\n",
- "                       user=os.environ.get('SHIREE_DB_USER', 'root'),\n",
- "                       password=os.environ.get('SHIREE_DB_PASSWORD', 'root'),\n",
- "                       db='shiree',\n",
- "                       charset='utf8mb4',\n",
- "                       cursorclass=pymysql.cursors.DictCursor)\n",
- "cursor = conn.cursor()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Using iga database"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 127,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 127,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cursor.execute(\"USE iga\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Fetching all data from the table tbl_iga"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 128,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\"[{'HHID': '532670910101', 'LeadNGO_Code': '10', 'PNGO_Code': '', 'FirstAssetMonth': 'December', 'Fir\""
- ]
- },
- "execution_count": 128,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cursor.execute('select * from tbl_iga');\n",
- "\n",
- "iga_table = cursor.fetchall()\n",
- "str(iga_table)[0:100]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Converting list of iga_table to pandas Data Frame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 129,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "iga_dataFrame = pd.DataFrame(iga_table)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Using shiree database"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 131,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0"
- ]
- },
- "execution_count": 131,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cursor.execute(\"USE shiree\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Fetching all data from the Table reportdata"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 132,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\"[{'data_id': 1.0, 'startTime': datetime.datetime(2012, 11, 27, 22, 3, 33), 'endTime': datetime.datetime(2012, 11, 27, 22, 4, 13), 'image': None, 'isExtracted': b'\\\\x00', 'latitude': 23.8062219, 'longitude': 90.4189349, 'received': datetime.datetime(2012, 11, 27, 22, 4, 26), 'form_id': 1.0, 'ngo_id': \""
- ]
- },
- "execution_count": 132,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "cursor.execute('select * from reportdata');\n",
- "\n",
- "reportdata_table = cursor.fetchall()\n",
- "str(reportdata_table)[0:300]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Converting list of reportdata_table to pandas Data Frame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 133,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "reportdata_dataFrame = pd.DataFrame(reportdata_table)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Renaming Column HH_ID to HHID"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 134,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "reportdata_dataFrame = reportdata_dataFrame.rename(columns = {'HH_ID':'HHID'})\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Merging the data frames reportdata_dataFrame and iga_dataFrame on their common column HHID"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 135,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "merged_dataFrame = pd.merge(reportdata_dataFrame,iga_dataFrame,on='HHID')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Total rows and columns in merged_dataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 136,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(8135, 107)\n"
- ]
- }
- ],
- "source": [
- "print(merged_dataFrame.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Replacing None value with 0.0 in CompositeIndex and casting it to float\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 137,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Missing CompositeIndex values become 0.0, then the whole column is cast to float.\n",
- "merged_dataFrame['CompositeIndex'] = (\n",
- "    merged_dataFrame['CompositeIndex']\n",
- "    .fillna(0.0)\n",
- "    .astype(float)\n",
- ")\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Making a new column named Graduation that holds 1 where CompositeIndex is greater than 2 and 0 otherwise"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 138,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "merged_dataFrame['Graduation'] = np.where(merged_dataFrame['CompositeIndex']>2, 1, 0)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Total Info in merged_dataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 139,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "<class 'pandas.core.frame.DataFrame'>\n",
- "Int64Index: 8135 entries, 0 to 8134\n",
- "Columns: 108 entries, Bad_Other_audio to Graduation\n",
- "dtypes: datetime64[ns](4), float64(10), int32(1), int64(4), object(89)\n",
- "memory usage: 6.7+ MB\n"
- ]
- }
- ],
- "source": [
- "merged_dataFrame.info()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Replacing all None values in economic status, confidence, happiness and income sources with -1 and casting all of them to integers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 140,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "# Survey answers later used as model features: a missing answer is encoded\n",
- "# as -1 and each column is then cast to int.\n",
- "survey_columns = ['shp3_econstatusComp', 'shp3_confidence',\n",
- "                  'shp3_happiness', 'shp3_incomeSources']\n",
- "for column in survey_columns:\n",
- "    merged_dataFrame[column] = merged_dataFrame[column].fillna(-1).astype(int)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Making a data frame using economic Status ,confidence , happiness , income source, graduation column from merged_dataFrame"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 141,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "feature_df = merged_dataFrame[['shp3_econstatusComp','shp3_confidence','shp3_happiness','shp3_incomeSources','Graduation']]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Making a NumPy matrix from the feature_df data frame; ncol is the number of columns in the matrix"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 142,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "fX = np.array(feature_df)\n",
- "ncol = fX.shape[1]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Balancing the data. The classes are imbalanced: there are thousands more rows where Graduation is 0 than where Graduation is 1. So we take all the rows where Graduation is 1 and only the first 1000 of the roughly 7500 rows where Graduation is 0."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 143,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(624, 5)\n",
- "(1000, 5)\n",
- "(1624, 5)\n"
- ]
- }
- ],
- "source": [
- "# Keep every row whose last column (Graduation) is 1.\n",
- "fX_1 = fX[fX[:, -1] == 1]\n",
- "print(fX_1.shape)\n",
- "\n",
- "# Keep only the first 1000 rows whose last column is 0.\n",
- "fX_0 = fX[fX[:, -1] == 0][:1000]\n",
- "print(fX_0.shape)\n",
- "\n",
- "# Stack the two groups: the label-1 rows first, then the label-0 rows.\n",
- "fX = np.concatenate((fX_1, fX_0), axis=0)\n",
- "print(fX.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Assigning X all the rows and every column except the last of the fX matrix as the feature values. Assigning y only the last column of the fX matrix as the target value."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 144,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Every column except the last is a feature; the last column is the target.\n",
- "X = fX[:, :-1]\n",
- "y = fX[:, -1]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### We can see that the target y is now roughly balanced, so the model should not be heavily biased towards one class: y contains 624 ones and 1000 zeros."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 145,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Total one in y is (624,), Total zero in y (1000,)\n"
- ]
- }
- ],
- "source": [
- "print(\"Total one in y is {}, Total zero in y {}\".format(y[y==1.0].shape,y[y==0.0].shape))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Splitting X and y into a training set and a test set."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 146,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# A fixed random_state makes the 70% training split, and every score computed\n",
- "# from it, reproducible across kernel restarts.\n",
- "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Making a logistic regression classifier and fitting it to the training data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 147,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Logistic regression Train Accuracy :: 0.954225352113\n",
- "Logistic regression Test Accuracy :: 0.963114754098\n"
- ]
- }
- ],
- "source": [
- "# Fit logistic regression on the training split, then report accuracy on\n",
- "# both the training and the test split.\n",
- "lr = LogisticRegression()\n",
- "lr.fit(X_train, y_train)\n",
- "\n",
- "print(\"Logistic regression Train Accuracy :: \", metrics.accuracy_score(y_train, lr.predict(X_train)))\n",
- "print(\"Logistic regression Test Accuracy :: \", metrics.accuracy_score(y_test, lr.predict(X_test)))\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Printing the confusion matrix of the test set."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 148,
- "metadata": {
- "collapsed": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[[281 12]\n",
- " [ 6 189]]\n"
- ]
- }
- ],
- "source": [
- "y_predict = lr.predict(X_test)\n",
- "print(confusion_matrix(y_test,y_predict ))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment