Guest User

Untitled

a guest
May 26th, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 47.88 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import numpy as np\n",
  10. "import pandas as pd\n",
  11. "import matplotlib.pyplot as plt\n",
  12. "import seaborn as sns\n",
  13. "\n",
  14. "% matplotlib inline"
  15. ]
  16. },
  17. {
  18. "cell_type": "markdown",
  19. "metadata": {},
  20. "source": [
  21. "# 1. Import the training set and do basic EDA"
  22. ]
  23. },
  24. {
  25. "cell_type": "code",
  26. "execution_count": 2,
  27. "metadata": {},
  28. "outputs": [],
  29. "source": [
  30. "#import the training set"
  31. ]
  32. },
  33. {
  34. "cell_type": "code",
  35. "execution_count": 3,
  36. "metadata": {},
  37. "outputs": [],
  38. "source": [
  39. "df = pd.read_csv('train (2).csv')"
  40. ]
  41. },
  42. {
  43. "cell_type": "code",
  44. "execution_count": 4,
  45. "metadata": {},
  46. "outputs": [],
  47. "source": [
  48. "submission = pd.read_csv('test.csv')"
  49. ]
  50. },
  51. {
  52. "cell_type": "code",
  53. "execution_count": 5,
  54. "metadata": {},
  55. "outputs": [],
  56. "source": [
  57. "# some EDA"
  58. ]
  59. },
  60. {
  61. "cell_type": "code",
  62. "execution_count": 6,
  63. "metadata": {
  64. "scrolled": true
  65. },
  66. "outputs": [],
  67. "source": [
  68. "# df.shape"
  69. ]
  70. },
  71. {
  72. "cell_type": "code",
  73. "execution_count": 7,
  74. "metadata": {},
  75. "outputs": [],
  76. "source": [
  77. "#df.isnull().sum().sort_values(ascending=False)[:26] #so these 26 variables have missing values"
  78. ]
  79. },
  80. {
  81. "cell_type": "code",
  82. "execution_count": 8,
  83. "metadata": {},
  84. "outputs": [
  85. {
  86. "data": {
  87. "text/plain": [
  88. "SalePrice 0\n",
  89. "dtype: int64"
  90. ]
  91. },
  92. "execution_count": 8,
  93. "metadata": {},
  94. "output_type": "execute_result"
  95. }
  96. ],
  97. "source": [
  98. "df[['SalePrice']].isnull().sum()"
  99. ]
  100. },
  101. {
  102. "cell_type": "code",
  103. "execution_count": 9,
  104. "metadata": {},
  105. "outputs": [],
  106. "source": [
  107. "#well, good news is that at least we have all the y values"
  108. ]
  109. },
  110. {
  111. "cell_type": "code",
  112. "execution_count": 10,
  113. "metadata": {
  114. "scrolled": false
  115. },
  116. "outputs": [],
  117. "source": [
  118. "#df[df.duplicated()] #no duplicates"
  119. ]
  120. },
  121. {
  122. "cell_type": "markdown",
  123. "metadata": {},
  124. "source": [
  125. "## 2. Looking for strong correlations between price and other variables"
  126. ]
  127. },
  128. {
  129. "cell_type": "code",
  130. "execution_count": 11,
  131. "metadata": {},
  132. "outputs": [],
  133. "source": [
  134. "# I am going to look for the variables with the highest corr (positive or negative) with SalePrice"
  135. ]
  136. },
  137. {
  138. "cell_type": "code",
  139. "execution_count": 12,
  140. "metadata": {
  141. "scrolled": true
  142. },
  143. "outputs": [
  144. {
  145. "data": {
  146. "text/html": [
  147. "<div>\n",
  148. "<style scoped>\n",
  149. " .dataframe tbody tr th:only-of-type {\n",
  150. " vertical-align: middle;\n",
  151. " }\n",
  152. "\n",
  153. " .dataframe tbody tr th {\n",
  154. " vertical-align: top;\n",
  155. " }\n",
  156. "\n",
  157. " .dataframe thead th {\n",
  158. " text-align: right;\n",
  159. " }\n",
  160. "</style>\n",
  161. "<table border=\"1\" class=\"dataframe\">\n",
  162. " <thead>\n",
  163. " <tr style=\"text-align: right;\">\n",
  164. " <th></th>\n",
  165. " <th>SalePrice</th>\n",
  166. " </tr>\n",
  167. " </thead>\n",
  168. " <tbody>\n",
  169. " <tr>\n",
  170. " <th>SalePrice</th>\n",
  171. " <td>1.000000</td>\n",
  172. " </tr>\n",
  173. " <tr>\n",
  174. " <th>Overall Qual</th>\n",
  175. " <td>0.800207</td>\n",
  176. " </tr>\n",
  177. " <tr>\n",
  178. " <th>Gr Liv Area</th>\n",
  179. " <td>0.697038</td>\n",
  180. " </tr>\n",
  181. " <tr>\n",
  182. " <th>Garage Area</th>\n",
  183. " <td>0.650270</td>\n",
  184. " </tr>\n",
  185. " <tr>\n",
  186. " <th>Garage Cars</th>\n",
  187. " <td>0.648220</td>\n",
  188. " </tr>\n",
  189. " <tr>\n",
  190. " <th>Total Bsmt SF</th>\n",
  191. " <td>0.628925</td>\n",
  192. " </tr>\n",
  193. " <tr>\n",
  194. " <th>1st Flr SF</th>\n",
  195. " <td>0.618486</td>\n",
  196. " </tr>\n",
  197. " <tr>\n",
  198. " <th>Year Built</th>\n",
  199. " <td>0.571849</td>\n",
  200. " </tr>\n",
  201. " <tr>\n",
  202. " <th>Year Remod/Add</th>\n",
  203. " <td>0.550370</td>\n",
  204. " </tr>\n",
  205. " <tr>\n",
  206. " <th>Full Bath</th>\n",
  207. " <td>0.537969</td>\n",
  208. " </tr>\n",
  209. " <tr>\n",
  210. " <th>Garage Yr Blt</th>\n",
  211. " <td>0.533922</td>\n",
  212. " </tr>\n",
  213. " <tr>\n",
  214. " <th>Mas Vnr Area</th>\n",
  215. " <td>0.512230</td>\n",
  216. " </tr>\n",
  217. " <tr>\n",
  218. " <th>TotRms AbvGrd</th>\n",
  219. " <td>0.504014</td>\n",
  220. " </tr>\n",
  221. " <tr>\n",
  222. " <th>Fireplaces</th>\n",
  223. " <td>0.471093</td>\n",
  224. " </tr>\n",
  225. " <tr>\n",
  226. " <th>BsmtFin SF 1</th>\n",
  227. " <td>0.423519</td>\n",
  228. " </tr>\n",
  229. " <tr>\n",
  230. " <th>Lot Frontage</th>\n",
  231. " <td>0.341842</td>\n",
  232. " </tr>\n",
  233. " <tr>\n",
  234. " <th>Open Porch SF</th>\n",
  235. " <td>0.333476</td>\n",
  236. " </tr>\n",
  237. " <tr>\n",
  238. " <th>Wood Deck SF</th>\n",
  239. " <td>0.326490</td>\n",
  240. " </tr>\n",
  241. " <tr>\n",
  242. " <th>Lot Area</th>\n",
  243. " <td>0.296566</td>\n",
  244. " </tr>\n",
  245. " <tr>\n",
  246. " <th>Bsmt Full Bath</th>\n",
  247. " <td>0.283662</td>\n",
  248. " </tr>\n",
  249. " <tr>\n",
  250. " <th>Half Bath</th>\n",
  251. " <td>0.283001</td>\n",
  252. " </tr>\n",
  253. " <tr>\n",
  254. " <th>PID</th>\n",
  255. " <td>0.255052</td>\n",
  256. " </tr>\n",
  257. " <tr>\n",
  258. " <th>2nd Flr SF</th>\n",
  259. " <td>0.248452</td>\n",
  260. " </tr>\n",
  261. " <tr>\n",
  262. " <th>Bsmt Unf SF</th>\n",
  263. " <td>0.190210</td>\n",
  264. " </tr>\n",
  265. " <tr>\n",
  266. " <th>Bedroom AbvGr</th>\n",
  267. " <td>0.137067</td>\n",
  268. " </tr>\n",
  269. " <tr>\n",
  270. " <th>Enclosed Porch</th>\n",
  271. " <td>0.135656</td>\n",
  272. " </tr>\n",
  273. " <tr>\n",
  274. " <th>Screen Porch</th>\n",
  275. " <td>0.134581</td>\n",
  276. " </tr>\n",
  277. " <tr>\n",
  278. " <th>Kitchen AbvGr</th>\n",
  279. " <td>0.125444</td>\n",
  280. " </tr>\n",
  281. " <tr>\n",
  282. " <th>Overall Cond</th>\n",
  283. " <td>0.097019</td>\n",
  284. " </tr>\n",
  285. " <tr>\n",
  286. " <th>MS SubClass</th>\n",
  287. " <td>0.087335</td>\n",
  288. " </tr>\n",
  289. " <tr>\n",
  290. " <th>Id</th>\n",
  291. " <td>0.051398</td>\n",
  292. " </tr>\n",
  293. " <tr>\n",
  294. " <th>3Ssn Porch</th>\n",
  295. " <td>0.048732</td>\n",
  296. " </tr>\n",
  297. " <tr>\n",
  298. " <th>Bsmt Half Bath</th>\n",
  299. " <td>0.045328</td>\n",
  300. " </tr>\n",
  301. " <tr>\n",
  302. " <th>Low Qual Fin SF</th>\n",
  303. " <td>0.041594</td>\n",
  304. " </tr>\n",
  305. " <tr>\n",
  306. " <th>Mo Sold</th>\n",
  307. " <td>0.032735</td>\n",
  308. " </tr>\n",
  309. " <tr>\n",
  310. " <th>Pool Area</th>\n",
  311. " <td>0.023106</td>\n",
  312. " </tr>\n",
  313. " <tr>\n",
  314. " <th>BsmtFin SF 2</th>\n",
  315. " <td>0.016255</td>\n",
  316. " </tr>\n",
  317. " <tr>\n",
  318. " <th>Yr Sold</th>\n",
  319. " <td>0.015203</td>\n",
  320. " </tr>\n",
  321. " <tr>\n",
  322. " <th>Misc Val</th>\n",
  323. " <td>0.007375</td>\n",
  324. " </tr>\n",
  325. " </tbody>\n",
  326. "</table>\n",
  327. "</div>"
  328. ],
  329. "text/plain": [
  330. " SalePrice\n",
  331. "SalePrice 1.000000\n",
  332. "Overall Qual 0.800207\n",
  333. "Gr Liv Area 0.697038\n",
  334. "Garage Area 0.650270\n",
  335. "Garage Cars 0.648220\n",
  336. "Total Bsmt SF 0.628925\n",
  337. "1st Flr SF 0.618486\n",
  338. "Year Built 0.571849\n",
  339. "Year Remod/Add 0.550370\n",
  340. "Full Bath 0.537969\n",
  341. "Garage Yr Blt 0.533922\n",
  342. "Mas Vnr Area 0.512230\n",
  343. "TotRms AbvGrd 0.504014\n",
  344. "Fireplaces 0.471093\n",
  345. "BsmtFin SF 1 0.423519\n",
  346. "Lot Frontage 0.341842\n",
  347. "Open Porch SF 0.333476\n",
  348. "Wood Deck SF 0.326490\n",
  349. "Lot Area 0.296566\n",
  350. "Bsmt Full Bath 0.283662\n",
  351. "Half Bath 0.283001\n",
  352. "PID 0.255052\n",
  353. "2nd Flr SF 0.248452\n",
  354. "Bsmt Unf SF 0.190210\n",
  355. "Bedroom AbvGr 0.137067\n",
  356. "Enclosed Porch 0.135656\n",
  357. "Screen Porch 0.134581\n",
  358. "Kitchen AbvGr 0.125444\n",
  359. "Overall Cond 0.097019\n",
  360. "MS SubClass 0.087335\n",
  361. "Id 0.051398\n",
  362. "3Ssn Porch 0.048732\n",
  363. "Bsmt Half Bath 0.045328\n",
  364. "Low Qual Fin SF 0.041594\n",
  365. "Mo Sold 0.032735\n",
  366. "Pool Area 0.023106\n",
  367. "BsmtFin SF 2 0.016255\n",
  368. "Yr Sold 0.015203\n",
  369. "Misc Val 0.007375"
  370. ]
  371. },
  372. "execution_count": 12,
  373. "metadata": {},
  374. "output_type": "execute_result"
  375. }
  376. ],
  377. "source": [
  378. "df.corr()[['SalePrice']].apply(abs).sort_values('SalePrice', ascending=False)"
  379. ]
  380. },
  381. {
  382. "cell_type": "code",
  383. "execution_count": 13,
  384. "metadata": {
  385. "scrolled": true
  386. },
  387. "outputs": [
  388. {
  389. "data": {
  390. "text/plain": [
  391. "Index(['SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
  392. " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
  393. " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
  394. " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
  395. " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Bsmt Full Bath',\n",
  396. " 'Half Bath', 'PID', '2nd Flr SF', 'Bsmt Unf SF', 'Bedroom AbvGr',\n",
  397. " 'Enclosed Porch', 'Screen Porch', 'Kitchen AbvGr', 'Overall Cond',\n",
  398. " 'MS SubClass', 'Id', '3Ssn Porch', 'Bsmt Half Bath', 'Low Qual Fin SF',\n",
  399. " 'Mo Sold', 'Pool Area', 'BsmtFin SF 2', 'Yr Sold', 'Misc Val'],\n",
  400. " dtype='object')"
  401. ]
  402. },
  403. "execution_count": 13,
  404. "metadata": {},
  405. "output_type": "execute_result"
  406. }
  407. ],
  408. "source": [
  409. "df.corr()[['SalePrice']].apply(abs).sort_values('SalePrice', ascending=False).index"
  410. ]
  411. },
  412. {
  413. "cell_type": "code",
  414. "execution_count": 14,
  415. "metadata": {},
  416. "outputs": [],
  417. "source": [
  418. "# looked at the ones with corr over 30%, then out of those used all numeric and checked which ones make sense\n",
  419. "# out of those that are categorical checked which ones have enough difference in values"
  420. ]
  421. },
  422. {
  423. "cell_type": "code",
  424. "execution_count": 15,
  425. "metadata": {
  426. "scrolled": true
  427. },
  428. "outputs": [
  429. {
  430. "data": {
  431. "text/plain": [
  432. "Y 1861\n",
  433. "N 151\n",
  434. "P 39\n",
  435. "Name: Paved Drive, dtype: int64"
  436. ]
  437. },
  438. "execution_count": 15,
  439. "metadata": {},
  440. "output_type": "execute_result"
  441. }
  442. ],
  443. "source": [
  444. "# how about adding Cerntal Air, Kitchen Qual, PavedDrive\n",
  445. "df['Paved Drive'].value_counts()"
  446. ]
  447. },
  448. {
  449. "cell_type": "markdown",
  450. "metadata": {},
  451. "source": [
  452. "## 3. Picking my predictors"
  453. ]
  454. },
  455. {
  456. "cell_type": "code",
  457. "execution_count": 16,
  458. "metadata": {},
  459. "outputs": [],
  460. "source": [
  461. "# creating a new df with the top variables with the highest correlation with y\n",
  462. "# doing the same for the submission set to match up"
  463. ]
  464. },
  465. {
  466. "cell_type": "code",
  467. "execution_count": 17,
  468. "metadata": {},
  469. "outputs": [],
  470. "source": [
  471. "submission = submission[['Id','Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
  472. " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
  473. " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt','Mas Vnr Area',\n",
  474. " 'TotRms AbvGrd', 'Kitchen Qual','Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
  475. " 'Open Porch SF', 'Wood Deck SF', 'Lot Area','Central Air','Paved Drive']]"
  476. ]
  477. },
  478. {
  479. "cell_type": "code",
  480. "execution_count": 18,
  481. "metadata": {},
  482. "outputs": [],
  483. "source": [
  484. "newdf = df[['Id','SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
  485. " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
  486. " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt','Mas Vnr Area',\n",
  487. " 'TotRms AbvGrd', 'Kitchen Qual','Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
  488. " 'Open Porch SF', 'Wood Deck SF', 'Lot Area','Central Air','Paved Drive']]"
  489. ]
  490. },
  491. {
  492. "cell_type": "code",
  493. "execution_count": 19,
  494. "metadata": {
  495. "scrolled": true
  496. },
  497. "outputs": [
  498. {
  499. "data": {
  500. "text/plain": [
  501. "((2051, 23), (879, 22))"
  502. ]
  503. },
  504. "execution_count": 19,
  505. "metadata": {},
  506. "output_type": "execute_result"
  507. }
  508. ],
  509. "source": [
  510. "newdf.shape, submission.shape"
  511. ]
  512. },
  513. {
  514. "cell_type": "code",
  515. "execution_count": 20,
  516. "metadata": {},
  517. "outputs": [],
  518. "source": [
  519. "#submission has one less column because it does not have SalePrice"
  520. ]
  521. },
  522. {
  523. "cell_type": "code",
  524. "execution_count": 21,
  525. "metadata": {},
  526. "outputs": [],
  527. "source": [
  528. "#just filling missing values with zeros\n",
  529. "#assuming the missing values are missing at the same rate fromt he training and test set it should not affect the model\n",
  530. "#too much\n",
  531. "\n",
  532. "newdf = newdf.fillna(0)\n",
  533. "submission = submission.fillna(0)"
  534. ]
  535. },
  536. {
  537. "cell_type": "code",
  538. "execution_count": 22,
  539. "metadata": {},
  540. "outputs": [],
  541. "source": [
  542. "#get dummies for the three categorical predictor variables\n",
  543. "newdf = pd.get_dummies(newdf, columns=['Central Air','Kitchen Qual','Paved Drive'], drop_first=True)\n",
  544. "submission = pd.get_dummies(submission, columns=['Central Air','Kitchen Qual','Paved Drive'], drop_first=True)"
  545. ]
  546. },
  547. {
  548. "cell_type": "code",
  549. "execution_count": 23,
  550. "metadata": {},
  551. "outputs": [],
  552. "source": [
  553. "# I am a little paranoid about missing values\n",
  554. "newdf = newdf.fillna(0)\n",
  555. "submission = submission.fillna(0)"
  556. ]
  557. },
  558. {
  559. "cell_type": "code",
  560. "execution_count": 24,
  561. "metadata": {},
  562. "outputs": [],
  563. "source": [
  564. "#these two variables for some reason gave off error, so I reassigned from floats to int\n",
  565. "\n",
  566. "newdf[\"Garage Area\"] = newdf[\"Garage Area\"].astype(int)\n",
  567. "newdf[\"Total Bsmt SF\"] = newdf[\"Total Bsmt SF\"].astype(int)\n",
  568. "\n",
  569. "submission[\"Garage Area\"] = submission[\"Garage Area\"].astype(int)\n",
  570. "submission[\"Total Bsmt SF\"] = submission[\"Total Bsmt SF\"].astype(int)"
  571. ]
  572. },
  573. {
  574. "cell_type": "markdown",
  575. "metadata": {},
  576. "source": [
  577. "## 3. Add interractions"
  578. ]
  579. },
  580. {
  581. "cell_type": "code",
  582. "execution_count": 25,
  583. "metadata": {},
  584. "outputs": [],
  585. "source": [
  586. "#some variables are highly correlated between themselved (garage area and garage cars), can try interractions\n",
  587. "newdf['garage'] = newdf['Garage Area'] * newdf['Garage Cars']\n",
  588. "newdf['Sq Ft'] = newdf['1st Flr SF'] * newdf['Total Bsmt SF']\n",
  589. "\n",
  590. "submission['garage'] = submission['Garage Area'] * submission['Garage Cars']\n",
  591. "submission['Sq Ft'] = submission['1st Flr SF'] * submission['Total Bsmt SF']"
  592. ]
  593. },
  594. {
  595. "cell_type": "markdown",
  596. "metadata": {},
  597. "source": [
  598. "## 4. Create predictor and target variables. Standardize the predictors."
  599. ]
  600. },
  601. {
  602. "cell_type": "code",
  603. "execution_count": 26,
  604. "metadata": {},
  605. "outputs": [
  606. {
  607. "data": {
  608. "text/plain": [
  609. "Index(['Id', 'SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
  610. " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
  611. " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
  612. " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
  613. " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Central Air_Y',\n",
  614. " 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',\n",
  615. " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft'],\n",
  616. " dtype='object')"
  617. ]
  618. },
  619. "execution_count": 26,
  620. "metadata": {},
  621. "output_type": "execute_result"
  622. }
  623. ],
  624. "source": [
  625. "newdf.columns"
  626. ]
  627. },
  628. {
  629. "cell_type": "code",
  630. "execution_count": 27,
  631. "metadata": {},
  632. "outputs": [],
  633. "source": [
  634. "features = ['Id','Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
  635. " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
  636. " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
  637. " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
  638. " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Central Air_Y',\n",
  639. " 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',\n",
  640. " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft']\n",
  641. "\n",
  642. "#all columns from my newdf except for SalePrice\n",
  643. " \n",
  644. "X = newdf[features]\n",
  645. "y = newdf['SalePrice']"
  646. ]
  647. },
  648. {
  649. "cell_type": "code",
  650. "execution_count": 28,
  651. "metadata": {},
  652. "outputs": [],
  653. "source": [
  654. "from sklearn.preprocessing import StandardScaler\n",
  655. "scaler = StandardScaler()\n",
  656. "\n",
  657. "X = scaler.fit_transform(X)\n",
  658. "\n",
  659. "#using the same feature list for the submission set to set X and scaling it\n",
  660. "\n",
  661. "X_submission = submission[features]\n",
  662. "X_submission = scaler.transform(X_submission)"
  663. ]
  664. },
  665. {
  666. "cell_type": "markdown",
  667. "metadata": {},
  668. "source": [
  669. "## 5. TTS for the training set"
  670. ]
  671. },
  672. {
  673. "cell_type": "code",
  674. "execution_count": 29,
  675. "metadata": {
  676. "scrolled": true
  677. },
  678. "outputs": [
  679. {
  680. "name": "stdout",
  681. "output_type": "stream",
  682. "text": [
  683. "(1538, 27)\n",
  684. "(513, 27)\n"
  685. ]
  686. }
  687. ],
  688. "source": [
  689. "from sklearn.model_selection import train_test_split\n",
  690. "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)\n",
  691. "print(X_train.shape)\n",
  692. "print(X_test.shape)"
  693. ]
  694. },
  695. {
  696. "cell_type": "markdown",
  697. "metadata": {},
  698. "source": [
  699. "## 6. Finally creating my simple MLR and scoring it"
  700. ]
  701. },
  702. {
  703. "cell_type": "code",
  704. "execution_count": 30,
  705. "metadata": {
  706. "scrolled": false
  707. },
  708. "outputs": [
  709. {
  710. "name": "stdout",
  711. "output_type": "stream",
  712. "text": [
  713. "0.8621025361868271\n",
  714. "181551.05762720952\n",
  715. "[ 160.42477762 20580.971557 20789.68604804 -15612.58599389\n",
  716. " -13670.91444598 26332.43145848 10655.46307571 6926.00533633\n",
  717. " 6998.86563432 455.0580014 6980.23750138 5036.52351063\n",
  718. " -137.91517397 4770.17283565 9247.00483347 1109.72198759\n",
  719. " 79.93465596 1307.2350037 7748.79666688 -565.6949233\n",
  720. " -7884.23462386 -25267.37779968 -29275.44484881 374.55638562\n",
  721. " 1289.63776226 32298.00649838 -36201.5792096 ]\n"
  722. ]
  723. }
  724. ],
  725. "source": [
  726. "from sklearn.linear_model import LinearRegression\n",
  727. "\n",
  728. "model = LinearRegression()\n",
  729. "model.fit(X_train, y_train)\n",
  730. "print(model.score(X_train, y_train))\n",
  731. "print(model.intercept_) \n",
  732. "print(model.coef_)"
  733. ]
  734. },
  735. {
  736. "cell_type": "code",
  737. "execution_count": 31,
  738. "metadata": {},
  739. "outputs": [],
  740. "source": [
  741. "# checking that the intercept is positive, score is 86% and coefficients make sense"
  742. ]
  743. },
  744. {
  745. "cell_type": "code",
  746. "execution_count": 32,
  747. "metadata": {},
  748. "outputs": [
  749. {
  750. "data": {
  751. "text/plain": [
  752. "0.8376768479647607"
  753. ]
  754. },
  755. "execution_count": 32,
  756. "metadata": {},
  757. "output_type": "execute_result"
  758. }
  759. ],
  760. "source": [
  761. "#cross-validate\n",
  762. "from sklearn.model_selection import cross_val_score\n",
  763. "cross_val_score(model, X_train, y_train, cv = 10).mean()"
  764. ]
  765. },
  766. {
  767. "cell_type": "code",
  768. "execution_count": 33,
  769. "metadata": {},
  770. "outputs": [],
  771. "source": [
  772. "# cv score is not that far off model performance score\n",
  773. "# try on the test set"
  774. ]
  775. },
  776. {
  777. "cell_type": "code",
  778. "execution_count": 34,
  779. "metadata": {},
  780. "outputs": [
  781. {
  782. "data": {
  783. "text/plain": [
  784. "0.8754011139284391"
  785. ]
  786. },
  787. "execution_count": 34,
  788. "metadata": {},
  789. "output_type": "execute_result"
  790. }
  791. ],
  792. "source": [
  793. "model.score(X_test, y_test)"
  794. ]
  795. },
  796. {
  797. "cell_type": "code",
  798. "execution_count": 55,
  799. "metadata": {},
  800. "outputs": [
  801. {
  802. "data": {
  803. "text/plain": [
  804. "array([124657.83797359, 233764.2025863 , 254367.39261074, 137446.72146039,\n",
  805. " 197344.58755117, 356142.9112147 , 160137.82757228, 311736.50122965,\n",
  806. " 171365.29778086, 81264.8676615 , 146448.02916319, 197410.09000769,\n",
  807. " 164974.86513526, 105107.90282686, 115053.07566765, 88830.51842119,\n",
  808. " 122484.81538231, 203201.40899428, 318984.54628359, 234744.60911307,\n",
  809. " 203904.51598581, 200768.48955219, 151210.8844595 , 113697.18153046,\n",
  810. " 157016.21134263, 237536.76572715, 168149.35330444, 121177.01740145,\n",
  811. " 247234.82342365, 113923.46942071, 358714.57925481, 107283.99453105,\n",
  812. " 120302.77111721, 182227.28136153, 198223.88439228, 79387.44304974,\n",
  813. " 167173.57489099, 231983.24821569, 308283.5710916 , 111798.75256198,\n",
  814. " 222992.10786668, 196391.3124267 , 292190.11720976, 183132.87943924,\n",
  815. " 82881.94232308, 173539.89291091, 112472.27437398, 94254.81153454,\n",
  816. " 293843.66021332, 141219.03807486, 374732.34038708, 63315.95270037,\n",
  817. " 117642.33594604, 216981.41350565, 153602.196533 , 105973.2243344 ,\n",
  818. " 242576.68949559, 196898.92493288, 125734.59767359, 172544.01827374,\n",
  819. " 263429.26662111, 46084.96328723, 181917.2596768 , 234150.21093609,\n",
  820. " 202473.35734056, 70098.65744871, 258148.03771344, 254956.6309359 ,\n",
  821. " 281755.32731966, 143632.79215952, 225249.6763857 , 316286.61102583,\n",
  822. " 277540.00572174, 179746.58404379, 123277.48367933, 209238.50848278,\n",
  823. " 208893.07067318, 149452.96104988, 160145.75996906, 310821.47641634,\n",
  824. " 124814.18064332, 119930.03286122, 298633.30575022, 159974.89311164,\n",
  825. " 112565.03259461, 104685.62948878, 115234.43370453, 227191.27818579,\n",
  826. " 67401.6298356 , 161964.23372381, 182187.96841228, 181846.58660351,\n",
  827. " 131156.81486101, 234330.56471751, 141979.15808302, 201345.62656676,\n",
  828. " 124703.85050689, 104745.83902015, 217374.65681401, 121498.78598703,\n",
  829. " 137862.68572143, 356166.55766044, 63295.54678353, 201135.7613614 ,\n",
  830. " 168641.94441875, 133144.99440749, 109045.53821833, 139705.27183219,\n",
  831. " 166940.16620052, 161602.05938393, 50451.97008826, 184054.20892553,\n",
  832. " 121585.44712225, 308413.41882712, 104794.47181733, 277226.79016093,\n",
  833. " 203993.24169746, 140521.16110067, 113620.97036663, 174269.15339756,\n",
  834. " 201652.81576382, 120965.50521131, 220615.27471873, 139831.5591772 ,\n",
  835. " 242915.13433874, 221523.61003021, 137969.20848692, 265955.25247273,\n",
  836. " 210587.74592602, 139611.79978723, 182291.8707243 , 123501.31649194,\n",
  837. " 159910.92255653, 361761.49946479, 144915.33186961, 173485.84368592,\n",
  838. " 79049.13663093, 161704.60302244, 187357.28345383, 148421.30784224,\n",
  839. " 53655.96082153, -5494.02972267, 132003.88862812, 76446.04464881,\n",
  840. " 245032.33932096, 217115.58037993, 400281.42470578, 184993.93695407,\n",
  841. " 117908.71208643, 163148.51488006, 80434.05427133, 150812.31577396,\n",
  842. " 132725.36100823, 295669.28245206, 102020.34214941, 183823.59898147,\n",
  843. " 219554.36188301, 141665.51668123, 129910.12660383, 188936.66836479,\n",
  844. " 126353.39607288, 252613.95445249, 135187.44351232, 172409.41337177,\n",
  845. " 81912.89760925, 198004.96484163, 127432.98473552, 200152.53554167,\n",
  846. " 122913.46102595, 307690.18408268, 223042.9387275 , 134024.68885368,\n",
  847. " 136343.05684234, 198102.27535371, 182963.36111638, 135158.3046069 ,\n",
  848. " 282506.20836892, 116962.59954744, 124452.1482912 , 102220.96386219,\n",
  849. " 215549.89173431, 151077.46694906, 162028.18339559, 202009.5716362 ,\n",
  850. " 123780.98037663, 160422.06139329, 159834.41266095, 129550.59302622,\n",
  851. " 151958.22182078, 175214.70195228, 220805.44568633, 119093.55697729,\n",
  852. " 147459.9021361 , 23609.41451905, 206232.16485731, 169958.85723586,\n",
  853. " 84108.69067598, 322123.75160603, 185379.76192374, 255047.21749802,\n",
  854. " 180369.6395729 , 109755.67100029, 177718.22207938, 148785.99493586,\n",
  855. " 120041.81151044, 110461.19337785, 194281.44626587, 196428.15820093,\n",
  856. " 140656.97276161, 246808.52122294, 121133.89332516, 407042.70610713,\n",
  857. " 144656.10321034, 158689.11477807, 359043.88653588, 203300.26657901,\n",
  858. " 264256.77604319, 169661.26950468, 283831.97819516, 135172.62912717,\n",
  859. " 98967.84685143, 163140.81126162, 161068.24102463, 187372.45153675,\n",
  860. " 95901.2397569 , 237899.05281083, 212137.08791207, 111953.3818717 ,\n",
  861. " 90941.53272891, 212985.61718494, 127565.71705964, 94662.37841506,\n",
  862. " 176632.80376506, 159025.63087976, 162651.5729982 , 83855.3390145 ,\n",
  863. " 201481.49137122, 59720.98812191, 146307.75108669, 180690.77135571,\n",
  864. " 235191.48004871, 147205.54297057, 112369.91654491, 163362.66128596,\n",
  865. " 136313.12194262, 190827.27540306, 358022.10950645, 194416.4327504 ,\n",
  866. " 172418.59397773, 259408.51228121, 225725.5972678 , 358190.43300976,\n",
  867. " 208473.9523822 , 199513.13762309, 410391.87222628, 143984.50096201,\n",
  868. " 180935.36566372, 100651.42058065, 380801.33374619, 218441.62475428,\n",
  869. " 305127.45914049, 83098.73647553, 196075.25203208, 176903.23828028,\n",
  870. " 355781.89304333, 160576.06414294, 217801.14308403, 213133.02868385,\n",
  871. " 119662.88697066, 330390.18104478, 175760.70505834, 157059.9499516 ,\n",
  872. " 77351.47976046, 309710.80339115, 170985.27305021, 338255.70285471,\n",
  873. " 305977.81198948, 197530.98359305, 182573.17908066, 154060.11692098,\n",
  874. " 91407.32405395, 186289.44118977, 186364.91798186, 185521.92932942,\n",
  875. " 11688.18069185, 273377.41157666, 77881.04295746, 95141.91316589,\n",
  876. " 115646.37905933, 145469.8770625 , 192182.46825271, 176464.06055369,\n",
  877. " 131750.63079031, 176466.27269551, 173174.26967938, 174173.18078469,\n",
  878. " 88650.36537463, 99808.48717825, 177221.60426115, 208911.70144379,\n",
  879. " 132097.5268803 , 317432.60793305, 246741.64119622, 182440.25193786,\n",
  880. " 178446.40349208, 237073.77070552, 153726.94035933, 169368.43007245,\n",
  881. " 187203.42504016, 182045.00723434, 124699.10992039, 96626.0860931 ,\n",
  882. " 188826.09692933, 136305.79087987, 142716.0901991 , 220400.06357049,\n",
  883. " 164012.33608787, 111708.77908851, 285375.66554193, 171888.87651805,\n",
  884. " 191666.39250057, 121541.27948664, 157468.53085798, 154957.24524095,\n",
  885. " 240766.14744621, 107713.43463302, 149108.13023492, 190147.14575301,\n",
  886. " 253279.75926354, 58768.52927957, 137823.29878865, 163557.87310245,\n",
  887. " 214792.28347583, 248569.61140424, 123089.96907005, 118542.61254051,\n",
  888. " 380241.32901023, 185467.89902433, 320531.71092592, 198141.80362753,\n",
  889. " 198478.11584753, 146896.80034552, 173912.05934481, 270442.49437023,\n",
  890. " 205490.64010563, 188825.52654707, 133233.17958404, 253667.87487414,\n",
  891. " 89889.93482285, 83017.36520753, 78125.93704976, 119300.54912959,\n",
  892. " 120814.76818627, 334275.0251345 , 304585.77697805, 169732.91822669,\n",
  893. " 375399.67570528, 252959.94782699, 7635.89020229, 180708.61211982,\n",
  894. " 196216.79217908, 361648.37702624, 162202.88759165, 151973.58931809,\n",
  895. " 205228.43938192, 215215.48738271, 228245.13968458, 192092.6612177 ,\n",
  896. " 224009.87177872, 265939.5682791 , 124190.16258513, 124535.06222545,\n",
  897. " 213891.57757138, 200580.94266151, 129701.87010673, 328969.92529661,\n",
  898. " 127616.29057409, 178214.89889451, 192128.31327859, 163944.55676737,\n",
  899. " 81336.31896423, 209171.32599572, 306959.92605085, 208429.7469269 ,\n",
  900. " 229702.14985188, 102093.12400235, 155852.24499003, 224425.23663134,\n",
  901. " 148800.15785812, 135990.02609349, 137295.46252351, 326361.49966217,\n",
  902. " 184541.98207679, 208113.3729331 , 265762.79991441, 175335.96591688,\n",
  903. " 161364.22046569, 284695.7522194 , 199073.11908501, 105105.36600987,\n",
  904. " 133141.62368594, 143566.77466175, 180385.8318132 , 194936.49291044,\n",
  905. " 192139.38399504, 201465.39686604, 202543.08087238, 25075.3506406 ,\n",
  906. " 257240.4327088 , 307277.20859791, 335738.22106316, 206784.47480231,\n",
  907. " 282854.09966046, 203055.34076483, 267080.78533884, 180607.57566504,\n",
  908. " 188924.40247689, 109301.77006503, 179538.35758517, 113185.57075429,\n",
  909. " 350668.85008696, 233047.85659837, 125049.58259053, 222748.91077882,\n",
  910. " 95599.42593969, 229747.3942105 , 103208.53186525, 139180.48780632,\n",
  911. " 309535.18194862, 239868.32094768, 262238.05780108, 251068.20766102,\n",
  912. " 71520.8631212 , 188762.94030874, 135854.58481016, 142700.35284738,\n",
  913. " 267882.00135667, 279267.46153769, 229349.55051675, 107359.92894866,\n",
  914. " 139821.99885774, 120275.90022229, 130280.15194458, 104826.70854003,\n",
  915. " 293208.08107508, 291146.81738048, 163790.90047027, 200037.77082319,\n",
  916. " 220459.22009246, 277315.09856244, 233455.12131475, 128736.50316072,\n",
  917. " 164521.2846689 , 186818.26505066, 158487.37704472, 65388.43056146,\n",
  918. " 469036.96918948, 287269.24357311, 315944.80979861, 289266.57784408,\n",
  919. " 193073.15654037, 126619.28261932, 228147.83723082, 200193.46479468,\n",
  920. " 167159.89921329, 166690.85749096, 109545.37760615, 208812.27796227,\n",
  921. " 204473.72375096, 168482.30360486, 217599.00367594, 73144.11793953,\n",
  922. " 379001.42098757, 128492.60965501, 129027.037483 , 352752.15673675,\n",
  923. " 261622.0204418 , 85981.04768673, 133950.42964152, 221298.18375014,\n",
  924. " 120078.2647881 , 100429.18990959, 198981.37985234, 110961.25478684,\n",
  925. " 136870.46673106, 346216.39487824, 125569.2921395 , 303893.79354058,\n",
  926. " 200505.30482216, 211505.39188718, 148682.40961178, 171772.81401003,\n",
  927. " 226517.67381888, 125683.24241152, 123194.39793531, 169408.23175902,\n",
  928. " 137911.5974203 , 79175.92823114, 185712.94482788, 186758.37651702,\n",
  929. " 130331.55358299, 209377.72141439, 254721.04534741, 152905.66997492,\n",
  930. " 128570.05153632, 214707.4023475 , 312380.11503756, 140472.52913344,\n",
  931. " 163913.36638207, 89084.80433478, 144439.61909276, 126477.04034433,\n",
  932. " 250156.73291305])"
  933. ]
  934. },
  935. "execution_count": 55,
  936. "metadata": {},
  937. "output_type": "execute_result"
  938. }
  939. ],
  940. "source": [
  941. "#making the predictions on my training test set to make sure the predictions actually make sense (not negative, for ex)\n",
  942. "model.predict(X_test)"
  943. ]
  944. },
  945. {
  946. "cell_type": "markdown",
  947. "metadata": {},
  948. "source": [
  949. "## 7. Applying the model to the kaggle test set"
  950. ]
  951. },
  952. {
  953. "cell_type": "code",
  954. "execution_count": 36,
  955. "metadata": {},
  956. "outputs": [],
  957. "source": [
  958. "y_submission = model.predict(X_submission)"
  959. ]
  960. },
  961. {
  962. "cell_type": "code",
  963. "execution_count": 37,
  964. "metadata": {
  965. "scrolled": true
  966. },
  967. "outputs": [
  968. {
  969. "data": {
  970. "text/plain": [
  971. "array([152491.21191136, 161327.4366651 , 205327.7427502 , 106147.05857414,\n",
  972. " 179755.97594702])"
  973. ]
  974. },
  975. "execution_count": 37,
  976. "metadata": {},
  977. "output_type": "execute_result"
  978. }
  979. ],
  980. "source": [
  981. "y_submission[0:5] #making sure that the results make sense"
  982. ]
  983. },
  984. {
  985. "cell_type": "markdown",
  986. "metadata": {},
  987. "source": [
  988. "## 8. Checking for column mismatch and fixing it"
  989. ]
  990. },
  991. {
  992. "cell_type": "markdown",
  993. "metadata": {},
  994. "source": [
  995. "##### Credit goes to Ki-Hoon, who basically gave me the code."
  996. ]
  997. },
  998. {
  999. "cell_type": "code",
  1000. "execution_count": 38,
  1001. "metadata": {
  1002. "scrolled": false
  1003. },
  1004. "outputs": [
  1005. {
  1006. "data": {
  1007. "text/plain": [
  1008. "['SalePrice']"
  1009. ]
  1010. },
  1011. "execution_count": 38,
  1012. "metadata": {},
  1013. "output_type": "execute_result"
  1014. }
  1015. ],
  1016. "source": [
  1017. "[col for col in newdf.columns if col not in submission.columns]"
  1018. ]
  1019. },
  1020. {
  1021. "cell_type": "code",
  1022. "execution_count": 39,
  1023. "metadata": {
  1024. "scrolled": true
  1025. },
  1026. "outputs": [
  1027. {
  1028. "data": {
  1029. "text/plain": [
  1030. "['Kitchen Qual_Po']"
  1031. ]
  1032. },
  1033. "execution_count": 39,
  1034. "metadata": {},
  1035. "output_type": "execute_result"
  1036. }
  1037. ],
  1038. "source": [
  1039. "[col for col in submission.columns if col not in newdf.columns]"
  1040. ]
  1041. },
  1042. {
  1043. "cell_type": "code",
  1044. "execution_count": 40,
  1045. "metadata": {},
  1046. "outputs": [],
  1047. "source": [
  1048. "all_cols = newdf.columns.union(submission.columns)\n",
  1049. "\n",
  1050. "newdf = newdf.assign(**{col:0 for col in all_cols.difference(newdf.columns).tolist()})\n",
  1051. "submission = submission.assign(**{col:0 for col in all_cols.difference(submission.columns).tolist()})"
  1052. ]
  1053. },
  1054. {
  1055. "cell_type": "markdown",
  1056. "metadata": {},
  1057. "source": [
  1058. "#### Confirming that columns now match, and putting them in the same order\n",
  1059. "### Ben's code could be useful here..."
  1060. ]
  1061. },
  1062. {
  1063. "cell_type": "code",
  1064. "execution_count": 41,
  1065. "metadata": {
  1066. "scrolled": false
  1067. },
  1068. "outputs": [
  1069. {
  1070. "data": {
  1071. "text/plain": [
  1072. "((2051, 29),\n",
  1073. " Index(['Id', 'SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
  1074. " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
  1075. " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
  1076. " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
  1077. " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Central Air_Y',\n",
  1078. " 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',\n",
  1079. " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft', 'Kitchen Qual_Po'],\n",
  1080. " dtype='object'))"
  1081. ]
  1082. },
  1083. "execution_count": 41,
  1084. "metadata": {},
  1085. "output_type": "execute_result"
  1086. }
  1087. ],
  1088. "source": [
  1089. "newdf.shape, newdf.columns"
  1090. ]
  1091. },
  1092. {
  1093. "cell_type": "code",
  1094. "execution_count": 42,
  1095. "metadata": {
  1096. "scrolled": false
  1097. },
  1098. "outputs": [
  1099. {
  1100. "data": {
  1101. "text/plain": [
  1102. "((879, 29),\n",
  1103. " Index(['Id', 'Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars',\n",
  1104. " 'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Year Remod/Add',\n",
  1105. " 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area', 'TotRms AbvGrd',\n",
  1106. " 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage', 'Open Porch SF',\n",
  1107. " 'Wood Deck SF', 'Lot Area', 'Central Air_Y', 'Kitchen Qual_Fa',\n",
  1108. " 'Kitchen Qual_Gd', 'Kitchen Qual_Po', 'Kitchen Qual_TA',\n",
  1109. " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft', 'SalePrice'],\n",
  1110. " dtype='object'))"
  1111. ]
  1112. },
  1113. "execution_count": 42,
  1114. "metadata": {},
  1115. "output_type": "execute_result"
  1116. }
  1117. ],
  1118. "source": [
  1119. "submission.shape, submission.columns"
  1120. ]
  1121. },
  1122. {
  1123. "cell_type": "code",
  1124. "execution_count": 43,
  1125. "metadata": {
  1126. "scrolled": true
  1127. },
  1128. "outputs": [
  1129. {
  1130. "data": {
  1131. "text/plain": [
  1132. "(879, 29)"
  1133. ]
  1134. },
  1135. "execution_count": 43,
  1136. "metadata": {},
  1137. "output_type": "execute_result"
  1138. }
  1139. ],
  1140. "source": [
  1141. "submission.shape"
  1142. ]
  1143. },
  1144. {
  1145. "cell_type": "code",
  1146. "execution_count": 44,
  1147. "metadata": {},
  1148. "outputs": [],
  1149. "source": [
  1150. "submission = submission[newdf.columns]"
  1151. ]
  1152. },
  1153. {
  1154. "cell_type": "code",
  1155. "execution_count": 45,
  1156. "metadata": {},
  1157. "outputs": [],
  1158. "source": [
  1159. "assert (submission.columns == newdf.columns).all().all()"
  1160. ]
  1161. },
  1162. {
  1163. "cell_type": "code",
  1164. "execution_count": 46,
  1165. "metadata": {
  1166. "scrolled": true
  1167. },
  1168. "outputs": [
  1169. {
  1170. "data": {
  1171. "text/html": [
  1172. "<div>\n",
  1173. "<style scoped>\n",
  1174. " .dataframe tbody tr th:only-of-type {\n",
  1175. " vertical-align: middle;\n",
  1176. " }\n",
  1177. "\n",
  1178. " .dataframe tbody tr th {\n",
  1179. " vertical-align: top;\n",
  1180. " }\n",
  1181. "\n",
  1182. " .dataframe thead th {\n",
  1183. " text-align: right;\n",
  1184. " }\n",
  1185. "</style>\n",
  1186. "<table border=\"1\" class=\"dataframe\">\n",
  1187. " <thead>\n",
  1188. " <tr style=\"text-align: right;\">\n",
  1189. " <th></th>\n",
  1190. " <th>Id</th>\n",
  1191. " <th>SalePrice</th>\n",
  1192. " <th>Overall Qual</th>\n",
  1193. " <th>Gr Liv Area</th>\n",
  1194. " <th>Garage Area</th>\n",
  1195. " <th>Garage Cars</th>\n",
  1196. " <th>Total Bsmt SF</th>\n",
  1197. " <th>1st Flr SF</th>\n",
  1198. " <th>Year Built</th>\n",
  1199. " <th>Year Remod/Add</th>\n",
  1200. " <th>...</th>\n",
  1201. " <th>Lot Area</th>\n",
  1202. " <th>Central Air_Y</th>\n",
  1203. " <th>Kitchen Qual_Fa</th>\n",
  1204. " <th>Kitchen Qual_Gd</th>\n",
  1205. " <th>Kitchen Qual_TA</th>\n",
  1206. " <th>Paved Drive_P</th>\n",
  1207. " <th>Paved Drive_Y</th>\n",
  1208. " <th>garage</th>\n",
  1209. " <th>Sq Ft</th>\n",
  1210. " <th>Kitchen Qual_Po</th>\n",
  1211. " </tr>\n",
  1212. " </thead>\n",
  1213. " <tbody>\n",
  1214. " <tr>\n",
  1215. " <th>0</th>\n",
  1216. " <td>109</td>\n",
  1217. " <td>130500</td>\n",
  1218. " <td>6</td>\n",
  1219. " <td>1479</td>\n",
  1220. " <td>475</td>\n",
  1221. " <td>2.0</td>\n",
  1222. " <td>725</td>\n",
  1223. " <td>725</td>\n",
  1224. " <td>1976</td>\n",
  1225. " <td>2005</td>\n",
  1226. " <td>...</td>\n",
  1227. " <td>13517</td>\n",
  1228. " <td>1</td>\n",
  1229. " <td>0</td>\n",
  1230. " <td>1</td>\n",
  1231. " <td>0</td>\n",
  1232. " <td>0</td>\n",
  1233. " <td>1</td>\n",
  1234. " <td>950.0</td>\n",
  1235. " <td>525625</td>\n",
  1236. " <td>0</td>\n",
  1237. " </tr>\n",
  1238. " </tbody>\n",
  1239. "</table>\n",
  1240. "<p>1 rows Γ— 29 columns</p>\n",
  1241. "</div>"
  1242. ],
  1243. "text/plain": [
  1244. " Id SalePrice Overall Qual Gr Liv Area Garage Area Garage Cars \\\n",
  1245. "0 109 130500 6 1479 475 2.0 \n",
  1246. "\n",
  1247. " Total Bsmt SF 1st Flr SF Year Built Year Remod/Add ... \\\n",
  1248. "0 725 725 1976 2005 ... \n",
  1249. "\n",
  1250. " Lot Area Central Air_Y Kitchen Qual_Fa Kitchen Qual_Gd Kitchen Qual_TA \\\n",
  1251. "0 13517 1 0 1 0 \n",
  1252. "\n",
  1253. " Paved Drive_P Paved Drive_Y garage Sq Ft Kitchen Qual_Po \n",
  1254. "0 0 1 950.0 525625 0 \n",
  1255. "\n",
  1256. "[1 rows x 29 columns]"
  1257. ]
  1258. },
  1259. "execution_count": 46,
  1260. "metadata": {},
  1261. "output_type": "execute_result"
  1262. }
  1263. ],
  1264. "source": [
  1265. "newdf.head(1)"
  1266. ]
  1267. },
  1268. {
  1269. "cell_type": "code",
  1270. "execution_count": 47,
  1271. "metadata": {},
  1272. "outputs": [
  1273. {
  1274. "data": {
  1275. "text/html": [
  1276. "<div>\n",
  1277. "<style scoped>\n",
  1278. " .dataframe tbody tr th:only-of-type {\n",
  1279. " vertical-align: middle;\n",
  1280. " }\n",
  1281. "\n",
  1282. " .dataframe tbody tr th {\n",
  1283. " vertical-align: top;\n",
  1284. " }\n",
  1285. "\n",
  1286. " .dataframe thead th {\n",
  1287. " text-align: right;\n",
  1288. " }\n",
  1289. "</style>\n",
  1290. "<table border=\"1\" class=\"dataframe\">\n",
  1291. " <thead>\n",
  1292. " <tr style=\"text-align: right;\">\n",
  1293. " <th></th>\n",
  1294. " <th>Id</th>\n",
  1295. " <th>SalePrice</th>\n",
  1296. " <th>Overall Qual</th>\n",
  1297. " <th>Gr Liv Area</th>\n",
  1298. " <th>Garage Area</th>\n",
  1299. " <th>Garage Cars</th>\n",
  1300. " <th>Total Bsmt SF</th>\n",
  1301. " <th>1st Flr SF</th>\n",
  1302. " <th>Year Built</th>\n",
  1303. " <th>Year Remod/Add</th>\n",
  1304. " <th>...</th>\n",
  1305. " <th>Lot Area</th>\n",
  1306. " <th>Central Air_Y</th>\n",
  1307. " <th>Kitchen Qual_Fa</th>\n",
  1308. " <th>Kitchen Qual_Gd</th>\n",
  1309. " <th>Kitchen Qual_TA</th>\n",
  1310. " <th>Paved Drive_P</th>\n",
  1311. " <th>Paved Drive_Y</th>\n",
  1312. " <th>garage</th>\n",
  1313. " <th>Sq Ft</th>\n",
  1314. " <th>Kitchen Qual_Po</th>\n",
  1315. " </tr>\n",
  1316. " </thead>\n",
  1317. " <tbody>\n",
  1318. " <tr>\n",
  1319. " <th>0</th>\n",
  1320. " <td>2658</td>\n",
  1321. " <td>0</td>\n",
  1322. " <td>6</td>\n",
  1323. " <td>1928</td>\n",
  1324. " <td>440</td>\n",
  1325. " <td>1.0</td>\n",
  1326. " <td>1020</td>\n",
  1327. " <td>908</td>\n",
  1328. " <td>1910</td>\n",
  1329. " <td>1950</td>\n",
  1330. " <td>...</td>\n",
  1331. " <td>9142</td>\n",
  1332. " <td>0</td>\n",
  1333. " <td>1</td>\n",
  1334. " <td>0</td>\n",
  1335. " <td>0</td>\n",
  1336. " <td>0</td>\n",
  1337. " <td>1</td>\n",
  1338. " <td>440.0</td>\n",
  1339. " <td>926160</td>\n",
  1340. " <td>0</td>\n",
  1341. " </tr>\n",
  1342. " </tbody>\n",
  1343. "</table>\n",
  1344. "<p>1 rows Γ— 29 columns</p>\n",
  1345. "</div>"
  1346. ],
  1347. "text/plain": [
  1348. " Id SalePrice Overall Qual Gr Liv Area Garage Area Garage Cars \\\n",
  1349. "0 2658 0 6 1928 440 1.0 \n",
  1350. "\n",
  1351. " Total Bsmt SF 1st Flr SF Year Built Year Remod/Add ... \\\n",
  1352. "0 1020 908 1910 1950 ... \n",
  1353. "\n",
  1354. " Lot Area Central Air_Y Kitchen Qual_Fa Kitchen Qual_Gd Kitchen Qual_TA \\\n",
  1355. "0 9142 0 1 0 0 \n",
  1356. "\n",
  1357. " Paved Drive_P Paved Drive_Y garage Sq Ft Kitchen Qual_Po \n",
  1358. "0 0 1 440.0 926160 0 \n",
  1359. "\n",
  1360. "[1 rows x 29 columns]"
  1361. ]
  1362. },
  1363. "execution_count": 47,
  1364. "metadata": {},
  1365. "output_type": "execute_result"
  1366. }
  1367. ],
  1368. "source": [
  1369. "submission.head(1)"
  1370. ]
  1371. },
  1372. {
  1373. "cell_type": "markdown",
  1374. "metadata": {},
  1375. "source": [
  1376. "### 9. Adding the predicted SalesPrices to the submission set"
  1377. ]
  1378. },
  1379. {
  1380. "cell_type": "code",
  1381. "execution_count": 48,
  1382. "metadata": {},
  1383. "outputs": [],
  1384. "source": [
  1385. "submission['SalePrice'] = y_submission"
  1386. ]
  1387. },
  1388. {
  1389. "cell_type": "markdown",
  1390. "metadata": {},
  1391. "source": [
  1392. "### 10. Subsetting the two required columns. "
  1393. ]
  1394. },
  1395. {
  1396. "cell_type": "code",
  1397. "execution_count": 49,
  1398. "metadata": {},
  1399. "outputs": [],
  1400. "source": [
  1401. "submission1 = submission[['Id','SalePrice']]"
  1402. ]
  1403. },
  1404. {
  1405. "cell_type": "code",
  1406. "execution_count": 50,
  1407. "metadata": {
  1408. "scrolled": false
  1409. },
  1410. "outputs": [
  1411. {
  1412. "data": {
  1413. "text/html": [
  1414. "<div>\n",
  1415. "<style scoped>\n",
  1416. " .dataframe tbody tr th:only-of-type {\n",
  1417. " vertical-align: middle;\n",
  1418. " }\n",
  1419. "\n",
  1420. " .dataframe tbody tr th {\n",
  1421. " vertical-align: top;\n",
  1422. " }\n",
  1423. "\n",
  1424. " .dataframe thead th {\n",
  1425. " text-align: right;\n",
  1426. " }\n",
  1427. "</style>\n",
  1428. "<table border=\"1\" class=\"dataframe\">\n",
  1429. " <thead>\n",
  1430. " <tr style=\"text-align: right;\">\n",
  1431. " <th></th>\n",
  1432. " <th>Id</th>\n",
  1433. " <th>SalePrice</th>\n",
  1434. " </tr>\n",
  1435. " </thead>\n",
  1436. " <tbody>\n",
  1437. " <tr>\n",
  1438. " <th>0</th>\n",
  1439. " <td>2658</td>\n",
  1440. " <td>152491.211911</td>\n",
  1441. " </tr>\n",
  1442. " </tbody>\n",
  1443. "</table>\n",
  1444. "</div>"
  1445. ],
  1446. "text/plain": [
  1447. " Id SalePrice\n",
  1448. "0 2658 152491.211911"
  1449. ]
  1450. },
  1451. "execution_count": 50,
  1452. "metadata": {},
  1453. "output_type": "execute_result"
  1454. }
  1455. ],
  1456. "source": [
  1457. "submission1.head(1)"
  1458. ]
  1459. },
  1460. {
  1461. "cell_type": "code",
  1462. "execution_count": 51,
  1463. "metadata": {},
  1464. "outputs": [],
  1465. "source": [
  1466. "# I have the index column (so it's 3 columns instead of 2)\n",
  1467. "# I know there is a way to just drop index while saving to csv, but I couldn't figure out how\n",
  1468. "# so dropping it by hand"
  1469. ]
  1470. },
  1471. {
  1472. "cell_type": "code",
  1473. "execution_count": 52,
  1474. "metadata": {},
  1475. "outputs": [],
  1476. "source": [
  1477. "submission1.set_index('Id', inplace=True)"
  1478. ]
  1479. },
  1480. {
  1481. "cell_type": "code",
  1482. "execution_count": 53,
  1483. "metadata": {},
  1484. "outputs": [
  1485. {
  1486. "data": {
  1487. "text/html": [
  1488. "<div>\n",
  1489. "<style scoped>\n",
  1490. " .dataframe tbody tr th:only-of-type {\n",
  1491. " vertical-align: middle;\n",
  1492. " }\n",
  1493. "\n",
  1494. " .dataframe tbody tr th {\n",
  1495. " vertical-align: top;\n",
  1496. " }\n",
  1497. "\n",
  1498. " .dataframe thead th {\n",
  1499. " text-align: right;\n",
  1500. " }\n",
  1501. "</style>\n",
  1502. "<table border=\"1\" class=\"dataframe\">\n",
  1503. " <thead>\n",
  1504. " <tr style=\"text-align: right;\">\n",
  1505. " <th></th>\n",
  1506. " <th>SalePrice</th>\n",
  1507. " </tr>\n",
  1508. " <tr>\n",
  1509. " <th>Id</th>\n",
  1510. " <th></th>\n",
  1511. " </tr>\n",
  1512. " </thead>\n",
  1513. " <tbody>\n",
  1514. " <tr>\n",
  1515. " <th>2658</th>\n",
  1516. " <td>152491.211911</td>\n",
  1517. " </tr>\n",
  1518. " </tbody>\n",
  1519. "</table>\n",
  1520. "</div>"
  1521. ],
  1522. "text/plain": [
  1523. " SalePrice\n",
  1524. "Id \n",
  1525. "2658 152491.211911"
  1526. ]
  1527. },
  1528. "execution_count": 53,
  1529. "metadata": {},
  1530. "output_type": "execute_result"
  1531. }
  1532. ],
  1533. "source": [
  1534. "submission1.head(1)"
  1535. ]
  1536. },
  1537. {
  1538. "cell_type": "code",
  1539. "execution_count": 54,
  1540. "metadata": {},
  1541. "outputs": [],
  1542. "source": [
  1543. "#submission1.to_csv('submission3.csv')"
  1544. ]
  1545. }
  1546. ],
  1547. "metadata": {
  1548. "kernelspec": {
  1549. "display_name": "Python [conda env:dsi]",
  1550. "language": "python",
  1551. "name": "conda-env-dsi-py"
  1552. },
  1553. "language_info": {
  1554. "codemirror_mode": {
  1555. "name": "ipython",
  1556. "version": 3
  1557. },
  1558. "file_extension": ".py",
  1559. "mimetype": "text/x-python",
  1560. "name": "python",
  1561. "nbconvert_exporter": "python",
  1562. "pygments_lexer": "ipython3",
  1563. "version": "3.6.5"
  1564. }
  1565. },
  1566. "nbformat": 4,
  1567. "nbformat_minor": 2
  1568. }
Add Comment
Please, Sign In to add comment