Guest User

Untitled

a guest
Mar 5th, 2018
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 13.20 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "### Implementing logistic regression as a binary classifier of Graduation."
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 125,
  13. "metadata": {
  14. "collapsed": false
  15. },
  16. "outputs": [],
  17. "source": [
  18. "import pymysql\n",
  19. "import pandas as pd\n",
  20. "import numpy as np\n",
  21. "from sklearn import linear_model\n",
  22. "from sklearn import metrics\n",
  23. "from sklearn.cross_validation import train_test_split\n",
  24. "import sklearn\n",
  25. "from sklearn import preprocessing\n",
  26. "from sklearn import linear_model\n",
  27. "from sklearn.linear_model import LogisticRegression\n",
  28. "from sklearn.metrics import confusion_matrix\n",
  29. "from sklearn.metrics import r2_score"
  30. ]
  31. },
  32. {
  33. "cell_type": "markdown",
  34. "metadata": {},
  35. "source": [
  36. "### Making a connection with the local database through SQL"
  37. ]
  38. },
  39. {
  40. "cell_type": "code",
  41. "execution_count": 126,
  42. "metadata": {
  43. "collapsed": false
  44. },
  45. "outputs": [],
  46. "source": [
  47. "conn = pymysql.connect(host='localhost',\n",
  48. " user='root',\n",
  49. " password='root',\n",
  50. " db='shiree',\n",
  51. " charset='utf8mb4',\n",
  52. " cursorclass=pymysql.cursors.DictCursor)\n",
  53. "cursor = conn.cursor()"
  54. ]
  55. },
  56. {
  57. "cell_type": "markdown",
  58. "metadata": {},
  59. "source": [
  60. "### Using iga database"
  61. ]
  62. },
  63. {
  64. "cell_type": "code",
  65. "execution_count": 127,
  66. "metadata": {
  67. "collapsed": false
  68. },
  69. "outputs": [
  70. {
  71. "data": {
  72. "text/plain": [
  73. "0"
  74. ]
  75. },
  76. "execution_count": 127,
  77. "metadata": {},
  78. "output_type": "execute_result"
  79. }
  80. ],
  81. "source": [
  82. "cursor.execute(\"USE iga\")"
  83. ]
  84. },
  85. {
  86. "cell_type": "markdown",
  87. "metadata": {},
  88. "source": [
  89. "### Fetching all data from the table tbl_iga"
  90. ]
  91. },
  92. {
  93. "cell_type": "code",
  94. "execution_count": 128,
  95. "metadata": {
  96. "collapsed": false
  97. },
  98. "outputs": [
  99. {
  100. "data": {
  101. "text/plain": [
  102. "\"[{'HHID': '532670910101', 'LeadNGO_Code': '10', 'PNGO_Code': '', 'FirstAssetMonth': 'December', 'Fir\""
  103. ]
  104. },
  105. "execution_count": 128,
  106. "metadata": {},
  107. "output_type": "execute_result"
  108. }
  109. ],
  110. "source": [
  111. "cursor.execute('select * from tbl_iga');\n",
  112. "\n",
  113. "iga_table = cursor.fetchall()\n",
  114. "str(iga_table)[0:100]"
  115. ]
  116. },
  117. {
  118. "cell_type": "markdown",
  119. "metadata": {},
  120. "source": [
  121. "### Converting list of iga_table to pandas Data Frame"
  122. ]
  123. },
  124. {
  125. "cell_type": "code",
  126. "execution_count": 129,
  127. "metadata": {
  128. "collapsed": true
  129. },
  130. "outputs": [],
  131. "source": [
  132. "iga_dataFrame = pd.DataFrame(iga_table)"
  133. ]
  134. },
  135. {
  136. "cell_type": "markdown",
  137. "metadata": {},
  138. "source": [
  139. "### Using shiree database"
  140. ]
  141. },
  142. {
  143. "cell_type": "code",
  144. "execution_count": 131,
  145. "metadata": {
  146. "collapsed": false
  147. },
  148. "outputs": [
  149. {
  150. "data": {
  151. "text/plain": [
  152. "0"
  153. ]
  154. },
  155. "execution_count": 131,
  156. "metadata": {},
  157. "output_type": "execute_result"
  158. }
  159. ],
  160. "source": [
  161. "cursor.execute(\"USE shiree\")"
  162. ]
  163. },
  164. {
  165. "cell_type": "markdown",
  166. "metadata": {},
  167. "source": [
  168. "### Fetching all data from the Table reportdata"
  169. ]
  170. },
  171. {
  172. "cell_type": "code",
  173. "execution_count": 132,
  174. "metadata": {
  175. "collapsed": false
  176. },
  177. "outputs": [
  178. {
  179. "data": {
  180. "text/plain": [
  181. "\"[{'data_id': 1.0, 'startTime': datetime.datetime(2012, 11, 27, 22, 3, 33), 'endTime': datetime.datetime(2012, 11, 27, 22, 4, 13), 'image': None, 'isExtracted': b'\\\\x00', 'latitude': 23.8062219, 'longitude': 90.4189349, 'received': datetime.datetime(2012, 11, 27, 22, 4, 26), 'form_id': 1.0, 'ngo_id': \""
  182. ]
  183. },
  184. "execution_count": 132,
  185. "metadata": {},
  186. "output_type": "execute_result"
  187. }
  188. ],
  189. "source": [
  190. "cursor.execute('select * from reportdata');\n",
  191. "\n",
  192. "reportdata_table = cursor.fetchall()\n",
  193. "str(reportdata_table)[0:300]"
  194. ]
  195. },
  196. {
  197. "cell_type": "markdown",
  198. "metadata": {},
  199. "source": [
  200. "### Converting list of reportdata_table to pandas Data Frame"
  201. ]
  202. },
  203. {
  204. "cell_type": "code",
  205. "execution_count": 133,
  206. "metadata": {
  207. "collapsed": true
  208. },
  209. "outputs": [],
  210. "source": [
  211. "reportdata_dataFrame = pd.DataFrame(reportdata_table)"
  212. ]
  213. },
  214. {
  215. "cell_type": "markdown",
  216. "metadata": {},
  217. "source": [
  218. "### Renaming Column HH_ID to HHID"
  219. ]
  220. },
  221. {
  222. "cell_type": "code",
  223. "execution_count": 134,
  224. "metadata": {
  225. "collapsed": true
  226. },
  227. "outputs": [],
  228. "source": [
  229. "reportdata_dataFrame = reportdata_dataFrame.rename(columns = {'HH_ID':'HHID'})\n"
  230. ]
  231. },
  232. {
  233. "cell_type": "markdown",
  234. "metadata": {},
  235. "source": [
  236. "### Merging data frames reportdata_dataFrame and iga_dataFrame on the common column HHID"
  237. ]
  238. },
  239. {
  240. "cell_type": "code",
  241. "execution_count": 135,
  242. "metadata": {
  243. "collapsed": true
  244. },
  245. "outputs": [],
  246. "source": [
  247. "merged_dataFrame = pd.merge(reportdata_dataFrame,iga_dataFrame,on='HHID')"
  248. ]
  249. },
  250. {
  251. "cell_type": "markdown",
  252. "metadata": {},
  253. "source": [
  254. "### Total rows and columns in merged_dataFrame"
  255. ]
  256. },
  257. {
  258. "cell_type": "code",
  259. "execution_count": 136,
  260. "metadata": {
  261. "collapsed": false
  262. },
  263. "outputs": [
  264. {
  265. "name": "stdout",
  266. "output_type": "stream",
  267. "text": [
  268. "(8135, 107)\n"
  269. ]
  270. }
  271. ],
  272. "source": [
  273. "print(merged_dataFrame.shape)"
  274. ]
  275. },
  276. {
  277. "cell_type": "markdown",
  278. "metadata": {},
  279. "source": [
  280. "### Replacing None values with 0.0 in CompositeIndex and casting it to float\n"
  281. ]
  282. },
  283. {
  284. "cell_type": "code",
  285. "execution_count": 137,
  286. "metadata": {
  287. "collapsed": true
  288. },
  289. "outputs": [],
  290. "source": [
  291. "merged_dataFrame['CompositeIndex'] = merged_dataFrame['CompositeIndex'].fillna(0.0)\n",
  292. "merged_dataFrame.CompositeIndex = merged_dataFrame.CompositeIndex.astype(float)\n",
  293. "\n"
  294. ]
  295. },
  296. {
  297. "cell_type": "markdown",
  298. "metadata": {},
  299. "source": [
  300. "### Making a new column named Graduation: put 1 where the CompositeIndex value is greater than 2, otherwise put 0"
  301. ]
  302. },
  303. {
  304. "cell_type": "code",
  305. "execution_count": 138,
  306. "metadata": {
  307. "collapsed": false
  308. },
  309. "outputs": [],
  310. "source": [
  311. "merged_dataFrame['Graduation'] = np.where(merged_dataFrame['CompositeIndex']>2, 1, 0)\n"
  312. ]
  313. },
  314. {
  315. "cell_type": "markdown",
  316. "metadata": {},
  317. "source": [
  318. "### Total Info in merged_dataFrame"
  319. ]
  320. },
  321. {
  322. "cell_type": "code",
  323. "execution_count": 139,
  324. "metadata": {
  325. "collapsed": false
  326. },
  327. "outputs": [
  328. {
  329. "name": "stdout",
  330. "output_type": "stream",
  331. "text": [
  332. "<class 'pandas.core.frame.DataFrame'>\n",
  333. "Int64Index: 8135 entries, 0 to 8134\n",
  334. "Columns: 108 entries, Bad_Other_audio to Graduation\n",
  335. "dtypes: datetime64[ns](4), float64(10), int32(1), int64(4), object(89)\n",
  336. "memory usage: 6.7+ MB\n"
  337. ]
  338. }
  339. ],
  340. "source": [
  341. "merged_dataFrame.info()"
  342. ]
  343. },
  344. {
  345. "cell_type": "markdown",
  346. "metadata": {},
  347. "source": [
  348. "### Replacing all None values in economic status, confidence, happiness, and income sources with -1, and casting all of them to integers"
  349. ]
  350. },
  351. {
  352. "cell_type": "code",
  353. "execution_count": 140,
  354. "metadata": {
  355. "collapsed": false
  356. },
  357. "outputs": [],
  358. "source": [
  359. "merged_dataFrame['shp3_econstatusComp'] = merged_dataFrame['shp3_econstatusComp'].fillna(-1)\n",
  360. "merged_dataFrame['shp3_confidence'] = merged_dataFrame['shp3_confidence'].fillna(-1)\n",
  361. "merged_dataFrame['shp3_happiness'] = merged_dataFrame['shp3_happiness'].fillna(-1)\n",
  362. "merged_dataFrame['shp3_incomeSources'] = merged_dataFrame['shp3_incomeSources'].fillna(-1)\n",
  363. "merged_dataFrame.shp3_econstatusComp = merged_dataFrame.shp3_econstatusComp.astype(int)\n",
  364. "merged_dataFrame.shp3_incomeSources = merged_dataFrame.shp3_incomeSources.astype(int)\n",
  365. "merged_dataFrame.shp3_confidence = merged_dataFrame.shp3_confidence.astype(int)\n",
  366. "merged_dataFrame.shp3_happiness = merged_dataFrame.shp3_happiness.astype(int)"
  367. ]
  368. },
  369. {
  370. "cell_type": "markdown",
  371. "metadata": {},
  372. "source": [
  373. "### Making a data frame using the economic status, confidence, happiness, income sources, and Graduation columns from merged_dataFrame"
  374. ]
  375. },
  376. {
  377. "cell_type": "code",
  378. "execution_count": 141,
  379. "metadata": {
  380. "collapsed": true
  381. },
  382. "outputs": [],
  383. "source": [
  384. "feature_df = merged_dataFrame[['shp3_econstatusComp','shp3_confidence','shp3_happiness','shp3_incomeSources','Graduation']]"
  385. ]
  386. },
  387. {
  388. "cell_type": "markdown",
  389. "metadata": {},
  390. "source": [
  391. "### Making a numpy array matrix from the feature_df data frame; ncol is the number of columns in the matrix"
  392. ]
  393. },
  394. {
  395. "cell_type": "code",
  396. "execution_count": 142,
  397. "metadata": {
  398. "collapsed": true
  399. },
  400. "outputs": [],
  401. "source": [
  402. "fX = np.array(feature_df)\n",
  403. "ncol = fX.shape[1]"
  404. ]
  405. },
  406. {
  407. "cell_type": "markdown",
  408. "metadata": {},
  409. "source": [
  410. "### Balancing the data. The model would be imbalanced, as there are thousands more rows where Graduation is 0 than where it is 1, so we take all the rows where Graduation is 1 and only 1000 of the ~7000 rows where Graduation is 0."
  411. ]
  412. },
  413. {
  414. "cell_type": "code",
  415. "execution_count": 143,
  416. "metadata": {
  417. "collapsed": false
  418. },
  419. "outputs": [
  420. {
  421. "name": "stdout",
  422. "output_type": "stream",
  423. "text": [
  424. "(624, 5)\n",
  425. "(1000, 5)\n",
  426. "(1624, 5)\n"
  427. ]
  428. }
  429. ],
  430. "source": [
  431. "fX_1 = fX[fX[0: ,-1] == 1.0]\n",
  432. "print(fX_1.shape)\n",
  433. "\n",
  434. "fX_0 = fX[fX[0: , -1] == 0.0]\n",
  435. "fX_0 = fX_0[:1000]\n",
  436. "\n",
  437. "print(fX_0.shape)\n",
  438. "\n",
  439. "fX = np.concatenate((fX_1,fX_0),axis=0)\n",
  440. "\n",
  441. "print(fX.shape)"
  442. ]
  443. },
  444. {
  445. "cell_type": "markdown",
  446. "metadata": {},
  447. "source": [
  448. "### Assigning X all the rows and every column except the last of the fX matrix as the feature values. Assigning y only the last column of the matrix fX as the target value."
  449. ]
  450. },
  451. {
  452. "cell_type": "code",
  453. "execution_count": 144,
  454. "metadata": {
  455. "collapsed": true
  456. },
  457. "outputs": [],
  458. "source": [
  459. "X = fX[0:, 0:(ncol - 1)]\n",
  460. "y = fX[ 0:,-1]"
  461. ]
  462. },
  463. {
  464. "cell_type": "markdown",
  465. "metadata": {},
  466. "source": [
  467. "### We can see that our model would not be biased, as the target value y is balanced: the number of 1s in y is 624 and the number of 0s is 1000."
  468. ]
  469. },
  470. {
  471. "cell_type": "code",
  472. "execution_count": 145,
  473. "metadata": {
  474. "collapsed": false
  475. },
  476. "outputs": [
  477. {
  478. "name": "stdout",
  479. "output_type": "stream",
  480. "text": [
  481. "Total one in y is (624,), Total zero in y (1000,)\n"
  482. ]
  483. }
  484. ],
  485. "source": [
  486. "print(\"Total one in y is {}, Total zero in y {}\".format(y[y==1.0].shape,y[y==0.0].shape))"
  487. ]
  488. },
  489. {
  490. "cell_type": "markdown",
  491. "metadata": {},
  492. "source": [
  493. "### Splitting X and y into a training set and a test set."
  494. ]
  495. },
  496. {
  497. "cell_type": "code",
  498. "execution_count": 146,
  499. "metadata": {
  500. "collapsed": true
  501. },
  502. "outputs": [],
  503. "source": [
  504. "X_train, X_test,y_train,y_test = train_test_split(X,y,train_size=0.7)"
  505. ]
  506. },
  507. {
  508. "cell_type": "markdown",
  509. "metadata": {},
  510. "source": [
  511. "### Making a Logistic Regression classifier and fitting it on the training data. NOTE(review): the code cell below evaluates `lr`, which is never defined anywhere in this notebook — a cell that creates and fits the model (e.g. `lr = LogisticRegression()` followed by `lr.fit(X_train, y_train)`) appears to be missing from this paste; confirm against the original notebook."
  512. ]
  513. },
  514. {
  515. "cell_type": "code",
  516. "execution_count": 147,
  517. "metadata": {
  518. "collapsed": false
  519. },
  520. "outputs": [
  521. {
  522. "name": "stdout",
  523. "output_type": "stream",
  524. "text": [
  525. "Logistic regression Train Accuracy :: 0.954225352113\n",
  526. "Logistic regression Test Accuracy :: 0.963114754098\n"
  527. ]
  528. }
  529. ],
  530. "source": [
  531. "print(\"Logistic regression Train Accuracy :: \", metrics.accuracy_score(y_train, lr.predict(X_train)))\n",
  532. "print (\"Logistic regression Test Accuracy :: \", metrics.accuracy_score(y_test, lr.predict(X_test)))\n"
  533. ]
  534. },
  535. {
  536. "cell_type": "markdown",
  537. "metadata": {},
  538. "source": [
  539. "### Printing the confusion matrix of the test set."
  540. ]
  541. },
  542. {
  543. "cell_type": "code",
  544. "execution_count": 148,
  545. "metadata": {
  546. "collapsed": false
  547. },
  548. "outputs": [
  549. {
  550. "name": "stdout",
  551. "output_type": "stream",
  552. "text": [
  553. "[[281 12]\n",
  554. " [ 6 189]]\n"
  555. ]
  556. }
  557. ],
  558. "source": [
  559. "y_predict = lr.predict(X_test)\n",
  560. "print(confusion_matrix(y_test,y_predict ))"
  561. ]
  562. },
  563. {
  564. "cell_type": "code",
  565. "execution_count": null,
  566. "metadata": {
  567. "collapsed": true
  568. },
  569. "outputs": [],
  570. "source": []
  571. }
  572. ],
  573. "metadata": {
  574. "kernelspec": {
  575. "display_name": "Python 3",
  576. "language": "python",
  577. "name": "python3"
  578. },
  579. "language_info": {
  580. "codemirror_mode": {
  581. "name": "ipython",
  582. "version": 3
  583. },
  584. "file_extension": ".py",
  585. "mimetype": "text/x-python",
  586. "name": "python",
  587. "nbconvert_exporter": "python",
  588. "pygments_lexer": "ipython3",
  589. "version": "3.6.0"
  590. }
  591. },
  592. "nbformat": 4,
  593. "nbformat_minor": 2
  594. }
Add Comment
Please, Sign In to add comment