Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
- "\n",
- "% matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 1. Import the training set and do basic EDA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "#import the training set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_csv('train (2).csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "submission = pd.read_csv('test.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "# some EDA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "# df.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "#df.isnull().sum().sort_values(ascending=False)[:26] #so these 26 variables have missing values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "SalePrice 0\n",
- "dtype: int64"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[['SalePrice']].isnull().sum()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "#well, good news is that at least we have all the y values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "#df[df.duplicated()] #no duplicates"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Looking for strong correlations between price and other variables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# I am going to look for the variables with the highest corr (positive or negative) with SalePrice"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>SalePrice</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>SalePrice</th>\n",
- " <td>1.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Overall Qual</th>\n",
- " <td>0.800207</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Gr Liv Area</th>\n",
- " <td>0.697038</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Garage Area</th>\n",
- " <td>0.650270</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Garage Cars</th>\n",
- " <td>0.648220</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Total Bsmt SF</th>\n",
- " <td>0.628925</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1st Flr SF</th>\n",
- " <td>0.618486</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Year Built</th>\n",
- " <td>0.571849</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Year Remod/Add</th>\n",
- " <td>0.550370</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Full Bath</th>\n",
- " <td>0.537969</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Garage Yr Blt</th>\n",
- " <td>0.533922</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Mas Vnr Area</th>\n",
- " <td>0.512230</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>TotRms AbvGrd</th>\n",
- " <td>0.504014</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Fireplaces</th>\n",
- " <td>0.471093</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>BsmtFin SF 1</th>\n",
- " <td>0.423519</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Lot Frontage</th>\n",
- " <td>0.341842</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Open Porch SF</th>\n",
- " <td>0.333476</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Wood Deck SF</th>\n",
- " <td>0.326490</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Lot Area</th>\n",
- " <td>0.296566</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Bsmt Full Bath</th>\n",
- " <td>0.283662</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Half Bath</th>\n",
- " <td>0.283001</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>PID</th>\n",
- " <td>0.255052</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2nd Flr SF</th>\n",
- " <td>0.248452</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Bsmt Unf SF</th>\n",
- " <td>0.190210</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Bedroom AbvGr</th>\n",
- " <td>0.137067</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Enclosed Porch</th>\n",
- " <td>0.135656</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Screen Porch</th>\n",
- " <td>0.134581</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Kitchen AbvGr</th>\n",
- " <td>0.125444</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Overall Cond</th>\n",
- " <td>0.097019</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>MS SubClass</th>\n",
- " <td>0.087335</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Id</th>\n",
- " <td>0.051398</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3Ssn Porch</th>\n",
- " <td>0.048732</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Bsmt Half Bath</th>\n",
- " <td>0.045328</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Low Qual Fin SF</th>\n",
- " <td>0.041594</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Mo Sold</th>\n",
- " <td>0.032735</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Pool Area</th>\n",
- " <td>0.023106</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>BsmtFin SF 2</th>\n",
- " <td>0.016255</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Yr Sold</th>\n",
- " <td>0.015203</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Misc Val</th>\n",
- " <td>0.007375</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " SalePrice\n",
- "SalePrice 1.000000\n",
- "Overall Qual 0.800207\n",
- "Gr Liv Area 0.697038\n",
- "Garage Area 0.650270\n",
- "Garage Cars 0.648220\n",
- "Total Bsmt SF 0.628925\n",
- "1st Flr SF 0.618486\n",
- "Year Built 0.571849\n",
- "Year Remod/Add 0.550370\n",
- "Full Bath 0.537969\n",
- "Garage Yr Blt 0.533922\n",
- "Mas Vnr Area 0.512230\n",
- "TotRms AbvGrd 0.504014\n",
- "Fireplaces 0.471093\n",
- "BsmtFin SF 1 0.423519\n",
- "Lot Frontage 0.341842\n",
- "Open Porch SF 0.333476\n",
- "Wood Deck SF 0.326490\n",
- "Lot Area 0.296566\n",
- "Bsmt Full Bath 0.283662\n",
- "Half Bath 0.283001\n",
- "PID 0.255052\n",
- "2nd Flr SF 0.248452\n",
- "Bsmt Unf SF 0.190210\n",
- "Bedroom AbvGr 0.137067\n",
- "Enclosed Porch 0.135656\n",
- "Screen Porch 0.134581\n",
- "Kitchen AbvGr 0.125444\n",
- "Overall Cond 0.097019\n",
- "MS SubClass 0.087335\n",
- "Id 0.051398\n",
- "3Ssn Porch 0.048732\n",
- "Bsmt Half Bath 0.045328\n",
- "Low Qual Fin SF 0.041594\n",
- "Mo Sold 0.032735\n",
- "Pool Area 0.023106\n",
- "BsmtFin SF 2 0.016255\n",
- "Yr Sold 0.015203\n",
- "Misc Val 0.007375"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.corr()[['SalePrice']].apply(abs).sort_values('SalePrice', ascending=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
- " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
- " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
- " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
- " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Bsmt Full Bath',\n",
- " 'Half Bath', 'PID', '2nd Flr SF', 'Bsmt Unf SF', 'Bedroom AbvGr',\n",
- " 'Enclosed Porch', 'Screen Porch', 'Kitchen AbvGr', 'Overall Cond',\n",
- " 'MS SubClass', 'Id', '3Ssn Porch', 'Bsmt Half Bath', 'Low Qual Fin SF',\n",
- " 'Mo Sold', 'Pool Area', 'BsmtFin SF 2', 'Yr Sold', 'Misc Val'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.corr()[['SalePrice']].apply(abs).sort_values('SalePrice', ascending=False).index"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "# looked at the ones with corr over 30%, then out of those used all numeric and checked which ones make sense\n",
- "# out of those that are categorical checked which ones have enough difference in values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Y 1861\n",
- "N 151\n",
- "P 39\n",
- "Name: Paved Drive, dtype: int64"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# how about adding Cerntal Air, Kitchen Qual, PavedDrive\n",
- "df['Paved Drive'].value_counts()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Picking my predictors"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "# creating a new df with the top variables with the highest correlation with y\n",
- "# doing the same for the submission set to match up"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "submission = submission[['Id','Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
- " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
- " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt','Mas Vnr Area',\n",
- " 'TotRms AbvGrd', 'Kitchen Qual','Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
- " 'Open Porch SF', 'Wood Deck SF', 'Lot Area','Central Air','Paved Drive']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "newdf = df[['Id','SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
- " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
- " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt','Mas Vnr Area',\n",
- " 'TotRms AbvGrd', 'Kitchen Qual','Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
- " 'Open Porch SF', 'Wood Deck SF', 'Lot Area','Central Air','Paved Drive']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "((2051, 23), (879, 22))"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "newdf.shape, submission.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "#submission has one less column because it does not have SalePrice"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "#just filling missing values with zeros\n",
- "#assuming the missing values are missing at the same rate fromt he training and test set it should not affect the model\n",
- "#too much\n",
- "\n",
- "newdf = newdf.fillna(0)\n",
- "submission = submission.fillna(0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "#get dummies for the three categorical predictor variables\n",
- "newdf = pd.get_dummies(newdf, columns=['Central Air','Kitchen Qual','Paved Drive'], drop_first=True)\n",
- "submission = pd.get_dummies(submission, columns=['Central Air','Kitchen Qual','Paved Drive'], drop_first=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "# I am a little paranoid about missing values\n",
- "newdf = newdf.fillna(0)\n",
- "submission = submission.fillna(0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "#these two variables for some reason gave off error, so I reassigned from floats to int\n",
- "\n",
- "newdf[\"Garage Area\"] = newdf[\"Garage Area\"].astype(int)\n",
- "newdf[\"Total Bsmt SF\"] = newdf[\"Total Bsmt SF\"].astype(int)\n",
- "\n",
- "submission[\"Garage Area\"] = submission[\"Garage Area\"].astype(int)\n",
- "submission[\"Total Bsmt SF\"] = submission[\"Total Bsmt SF\"].astype(int)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 3. Add interractions"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "#some variables are highly correlated between themselved (garage area and garage cars), can try interractions\n",
- "newdf['garage'] = newdf['Garage Area'] * newdf['Garage Cars']\n",
- "newdf['Sq Ft'] = newdf['1st Flr SF'] * newdf['Total Bsmt SF']\n",
- "\n",
- "submission['garage'] = submission['Garage Area'] * submission['Garage Cars']\n",
- "submission['Sq Ft'] = submission['1st Flr SF'] * submission['Total Bsmt SF']"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 4. Create predictor and target variables. Standardize the predictors."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['Id', 'SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
- " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
- " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
- " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
- " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Central Air_Y',\n",
- " 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',\n",
- " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "newdf.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "features = ['Id','Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
- " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
- " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
- " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
- " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Central Air_Y',\n",
- " 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',\n",
- " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft']\n",
- "\n",
- "#all columns from my newdf except for SalePrice\n",
- " \n",
- "X = newdf[features]\n",
- "y = newdf['SalePrice']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.preprocessing import StandardScaler\n",
- "scaler = StandardScaler()\n",
- "\n",
- "X = scaler.fit_transform(X)\n",
- "\n",
- "#using the same feature list for the submission set to set X and scaling it\n",
- "\n",
- "X_submission = submission[features]\n",
- "X_submission = scaler.transform(X_submission)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 5. TTS for the training set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(1538, 27)\n",
- "(513, 27)\n"
- ]
- }
- ],
- "source": [
- "from sklearn.model_selection import train_test_split\n",
- "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)\n",
- "print(X_train.shape)\n",
- "print(X_test.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 6. Finally creating my simple MLR and scoring it"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0.8621025361868271\n",
- "181551.05762720952\n",
- "[ 160.42477762 20580.971557 20789.68604804 -15612.58599389\n",
- " -13670.91444598 26332.43145848 10655.46307571 6926.00533633\n",
- " 6998.86563432 455.0580014 6980.23750138 5036.52351063\n",
- " -137.91517397 4770.17283565 9247.00483347 1109.72198759\n",
- " 79.93465596 1307.2350037 7748.79666688 -565.6949233\n",
- " -7884.23462386 -25267.37779968 -29275.44484881 374.55638562\n",
- " 1289.63776226 32298.00649838 -36201.5792096 ]\n"
- ]
- }
- ],
- "source": [
- "from sklearn.linear_model import LinearRegression\n",
- "\n",
- "model = LinearRegression()\n",
- "model.fit(X_train, y_train)\n",
- "print(model.score(X_train, y_train))\n",
- "print(model.intercept_) \n",
- "print(model.coef_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "# checking that the intercept is positive, score is 86% and coefficients make sense"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.8376768479647607"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#cross-validate\n",
- "from sklearn.model_selection import cross_val_score\n",
- "cross_val_score(model, X_train, y_train, cv = 10).mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "# cv score is not that far off model performance score\n",
- "# try on the test set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.8754011139284391"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.score(X_test, y_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([124657.83797359, 233764.2025863 , 254367.39261074, 137446.72146039,\n",
- " 197344.58755117, 356142.9112147 , 160137.82757228, 311736.50122965,\n",
- " 171365.29778086, 81264.8676615 , 146448.02916319, 197410.09000769,\n",
- " 164974.86513526, 105107.90282686, 115053.07566765, 88830.51842119,\n",
- " 122484.81538231, 203201.40899428, 318984.54628359, 234744.60911307,\n",
- " 203904.51598581, 200768.48955219, 151210.8844595 , 113697.18153046,\n",
- " 157016.21134263, 237536.76572715, 168149.35330444, 121177.01740145,\n",
- " 247234.82342365, 113923.46942071, 358714.57925481, 107283.99453105,\n",
- " 120302.77111721, 182227.28136153, 198223.88439228, 79387.44304974,\n",
- " 167173.57489099, 231983.24821569, 308283.5710916 , 111798.75256198,\n",
- " 222992.10786668, 196391.3124267 , 292190.11720976, 183132.87943924,\n",
- " 82881.94232308, 173539.89291091, 112472.27437398, 94254.81153454,\n",
- " 293843.66021332, 141219.03807486, 374732.34038708, 63315.95270037,\n",
- " 117642.33594604, 216981.41350565, 153602.196533 , 105973.2243344 ,\n",
- " 242576.68949559, 196898.92493288, 125734.59767359, 172544.01827374,\n",
- " 263429.26662111, 46084.96328723, 181917.2596768 , 234150.21093609,\n",
- " 202473.35734056, 70098.65744871, 258148.03771344, 254956.6309359 ,\n",
- " 281755.32731966, 143632.79215952, 225249.6763857 , 316286.61102583,\n",
- " 277540.00572174, 179746.58404379, 123277.48367933, 209238.50848278,\n",
- " 208893.07067318, 149452.96104988, 160145.75996906, 310821.47641634,\n",
- " 124814.18064332, 119930.03286122, 298633.30575022, 159974.89311164,\n",
- " 112565.03259461, 104685.62948878, 115234.43370453, 227191.27818579,\n",
- " 67401.6298356 , 161964.23372381, 182187.96841228, 181846.58660351,\n",
- " 131156.81486101, 234330.56471751, 141979.15808302, 201345.62656676,\n",
- " 124703.85050689, 104745.83902015, 217374.65681401, 121498.78598703,\n",
- " 137862.68572143, 356166.55766044, 63295.54678353, 201135.7613614 ,\n",
- " 168641.94441875, 133144.99440749, 109045.53821833, 139705.27183219,\n",
- " 166940.16620052, 161602.05938393, 50451.97008826, 184054.20892553,\n",
- " 121585.44712225, 308413.41882712, 104794.47181733, 277226.79016093,\n",
- " 203993.24169746, 140521.16110067, 113620.97036663, 174269.15339756,\n",
- " 201652.81576382, 120965.50521131, 220615.27471873, 139831.5591772 ,\n",
- " 242915.13433874, 221523.61003021, 137969.20848692, 265955.25247273,\n",
- " 210587.74592602, 139611.79978723, 182291.8707243 , 123501.31649194,\n",
- " 159910.92255653, 361761.49946479, 144915.33186961, 173485.84368592,\n",
- " 79049.13663093, 161704.60302244, 187357.28345383, 148421.30784224,\n",
- " 53655.96082153, -5494.02972267, 132003.88862812, 76446.04464881,\n",
- " 245032.33932096, 217115.58037993, 400281.42470578, 184993.93695407,\n",
- " 117908.71208643, 163148.51488006, 80434.05427133, 150812.31577396,\n",
- " 132725.36100823, 295669.28245206, 102020.34214941, 183823.59898147,\n",
- " 219554.36188301, 141665.51668123, 129910.12660383, 188936.66836479,\n",
- " 126353.39607288, 252613.95445249, 135187.44351232, 172409.41337177,\n",
- " 81912.89760925, 198004.96484163, 127432.98473552, 200152.53554167,\n",
- " 122913.46102595, 307690.18408268, 223042.9387275 , 134024.68885368,\n",
- " 136343.05684234, 198102.27535371, 182963.36111638, 135158.3046069 ,\n",
- " 282506.20836892, 116962.59954744, 124452.1482912 , 102220.96386219,\n",
- " 215549.89173431, 151077.46694906, 162028.18339559, 202009.5716362 ,\n",
- " 123780.98037663, 160422.06139329, 159834.41266095, 129550.59302622,\n",
- " 151958.22182078, 175214.70195228, 220805.44568633, 119093.55697729,\n",
- " 147459.9021361 , 23609.41451905, 206232.16485731, 169958.85723586,\n",
- " 84108.69067598, 322123.75160603, 185379.76192374, 255047.21749802,\n",
- " 180369.6395729 , 109755.67100029, 177718.22207938, 148785.99493586,\n",
- " 120041.81151044, 110461.19337785, 194281.44626587, 196428.15820093,\n",
- " 140656.97276161, 246808.52122294, 121133.89332516, 407042.70610713,\n",
- " 144656.10321034, 158689.11477807, 359043.88653588, 203300.26657901,\n",
- " 264256.77604319, 169661.26950468, 283831.97819516, 135172.62912717,\n",
- " 98967.84685143, 163140.81126162, 161068.24102463, 187372.45153675,\n",
- " 95901.2397569 , 237899.05281083, 212137.08791207, 111953.3818717 ,\n",
- " 90941.53272891, 212985.61718494, 127565.71705964, 94662.37841506,\n",
- " 176632.80376506, 159025.63087976, 162651.5729982 , 83855.3390145 ,\n",
- " 201481.49137122, 59720.98812191, 146307.75108669, 180690.77135571,\n",
- " 235191.48004871, 147205.54297057, 112369.91654491, 163362.66128596,\n",
- " 136313.12194262, 190827.27540306, 358022.10950645, 194416.4327504 ,\n",
- " 172418.59397773, 259408.51228121, 225725.5972678 , 358190.43300976,\n",
- " 208473.9523822 , 199513.13762309, 410391.87222628, 143984.50096201,\n",
- " 180935.36566372, 100651.42058065, 380801.33374619, 218441.62475428,\n",
- " 305127.45914049, 83098.73647553, 196075.25203208, 176903.23828028,\n",
- " 355781.89304333, 160576.06414294, 217801.14308403, 213133.02868385,\n",
- " 119662.88697066, 330390.18104478, 175760.70505834, 157059.9499516 ,\n",
- " 77351.47976046, 309710.80339115, 170985.27305021, 338255.70285471,\n",
- " 305977.81198948, 197530.98359305, 182573.17908066, 154060.11692098,\n",
- " 91407.32405395, 186289.44118977, 186364.91798186, 185521.92932942,\n",
- " 11688.18069185, 273377.41157666, 77881.04295746, 95141.91316589,\n",
- " 115646.37905933, 145469.8770625 , 192182.46825271, 176464.06055369,\n",
- " 131750.63079031, 176466.27269551, 173174.26967938, 174173.18078469,\n",
- " 88650.36537463, 99808.48717825, 177221.60426115, 208911.70144379,\n",
- " 132097.5268803 , 317432.60793305, 246741.64119622, 182440.25193786,\n",
- " 178446.40349208, 237073.77070552, 153726.94035933, 169368.43007245,\n",
- " 187203.42504016, 182045.00723434, 124699.10992039, 96626.0860931 ,\n",
- " 188826.09692933, 136305.79087987, 142716.0901991 , 220400.06357049,\n",
- " 164012.33608787, 111708.77908851, 285375.66554193, 171888.87651805,\n",
- " 191666.39250057, 121541.27948664, 157468.53085798, 154957.24524095,\n",
- " 240766.14744621, 107713.43463302, 149108.13023492, 190147.14575301,\n",
- " 253279.75926354, 58768.52927957, 137823.29878865, 163557.87310245,\n",
- " 214792.28347583, 248569.61140424, 123089.96907005, 118542.61254051,\n",
- " 380241.32901023, 185467.89902433, 320531.71092592, 198141.80362753,\n",
- " 198478.11584753, 146896.80034552, 173912.05934481, 270442.49437023,\n",
- " 205490.64010563, 188825.52654707, 133233.17958404, 253667.87487414,\n",
- " 89889.93482285, 83017.36520753, 78125.93704976, 119300.54912959,\n",
- " 120814.76818627, 334275.0251345 , 304585.77697805, 169732.91822669,\n",
- " 375399.67570528, 252959.94782699, 7635.89020229, 180708.61211982,\n",
- " 196216.79217908, 361648.37702624, 162202.88759165, 151973.58931809,\n",
- " 205228.43938192, 215215.48738271, 228245.13968458, 192092.6612177 ,\n",
- " 224009.87177872, 265939.5682791 , 124190.16258513, 124535.06222545,\n",
- " 213891.57757138, 200580.94266151, 129701.87010673, 328969.92529661,\n",
- " 127616.29057409, 178214.89889451, 192128.31327859, 163944.55676737,\n",
- " 81336.31896423, 209171.32599572, 306959.92605085, 208429.7469269 ,\n",
- " 229702.14985188, 102093.12400235, 155852.24499003, 224425.23663134,\n",
- " 148800.15785812, 135990.02609349, 137295.46252351, 326361.49966217,\n",
- " 184541.98207679, 208113.3729331 , 265762.79991441, 175335.96591688,\n",
- " 161364.22046569, 284695.7522194 , 199073.11908501, 105105.36600987,\n",
- " 133141.62368594, 143566.77466175, 180385.8318132 , 194936.49291044,\n",
- " 192139.38399504, 201465.39686604, 202543.08087238, 25075.3506406 ,\n",
- " 257240.4327088 , 307277.20859791, 335738.22106316, 206784.47480231,\n",
- " 282854.09966046, 203055.34076483, 267080.78533884, 180607.57566504,\n",
- " 188924.40247689, 109301.77006503, 179538.35758517, 113185.57075429,\n",
- " 350668.85008696, 233047.85659837, 125049.58259053, 222748.91077882,\n",
- " 95599.42593969, 229747.3942105 , 103208.53186525, 139180.48780632,\n",
- " 309535.18194862, 239868.32094768, 262238.05780108, 251068.20766102,\n",
- " 71520.8631212 , 188762.94030874, 135854.58481016, 142700.35284738,\n",
- " 267882.00135667, 279267.46153769, 229349.55051675, 107359.92894866,\n",
- " 139821.99885774, 120275.90022229, 130280.15194458, 104826.70854003,\n",
- " 293208.08107508, 291146.81738048, 163790.90047027, 200037.77082319,\n",
- " 220459.22009246, 277315.09856244, 233455.12131475, 128736.50316072,\n",
- " 164521.2846689 , 186818.26505066, 158487.37704472, 65388.43056146,\n",
- " 469036.96918948, 287269.24357311, 315944.80979861, 289266.57784408,\n",
- " 193073.15654037, 126619.28261932, 228147.83723082, 200193.46479468,\n",
- " 167159.89921329, 166690.85749096, 109545.37760615, 208812.27796227,\n",
- " 204473.72375096, 168482.30360486, 217599.00367594, 73144.11793953,\n",
- " 379001.42098757, 128492.60965501, 129027.037483 , 352752.15673675,\n",
- " 261622.0204418 , 85981.04768673, 133950.42964152, 221298.18375014,\n",
- " 120078.2647881 , 100429.18990959, 198981.37985234, 110961.25478684,\n",
- " 136870.46673106, 346216.39487824, 125569.2921395 , 303893.79354058,\n",
- " 200505.30482216, 211505.39188718, 148682.40961178, 171772.81401003,\n",
- " 226517.67381888, 125683.24241152, 123194.39793531, 169408.23175902,\n",
- " 137911.5974203 , 79175.92823114, 185712.94482788, 186758.37651702,\n",
- " 130331.55358299, 209377.72141439, 254721.04534741, 152905.66997492,\n",
- " 128570.05153632, 214707.4023475 , 312380.11503756, 140472.52913344,\n",
- " 163913.36638207, 89084.80433478, 144439.61909276, 126477.04034433,\n",
- " 250156.73291305])"
- ]
- },
- "execution_count": 55,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#making the predictions on my training test set to make sure the predictions actually make sense (not negative, for ex)\n",
- "model.predict(X_test)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 7. Applying the model to the kaggle test set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [],
- "source": [
- "y_submission = model.predict(X_submission)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([152491.21191136, 161327.4366651 , 205327.7427502 , 106147.05857414,\n",
- " 179755.97594702])"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "y_submission[0:5] #making sure that the results make sense"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 8. Checking for column mismatch and fixing it"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "##### Credit goes to Ki-Hoon, who basically gave me the code."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['SalePrice']"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "[col for col in newdf.columns if col not in submission.columns]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Kitchen Qual_Po']"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "[col for col in submission.columns if col not in newdf.columns]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "all_cols = newdf.columns.union(submission.columns)\n",
- "\n",
- "newdf = newdf.assign(**{col:0 for col in all_cols.difference(newdf.columns).tolist()})\n",
- "submission = submission.assign(**{col:0 for col in all_cols.difference(submission.columns).tolist()})"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### Confirming that columns now match, and putting them in the same order\n",
- "### Ben's code could be useful here..."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "((2051, 29),\n",
- " Index(['Id', 'SalePrice', 'Overall Qual', 'Gr Liv Area', 'Garage Area',\n",
- " 'Garage Cars', 'Total Bsmt SF', '1st Flr SF', 'Year Built',\n",
- " 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area',\n",
- " 'TotRms AbvGrd', 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage',\n",
- " 'Open Porch SF', 'Wood Deck SF', 'Lot Area', 'Central Air_Y',\n",
- " 'Kitchen Qual_Fa', 'Kitchen Qual_Gd', 'Kitchen Qual_TA',\n",
- " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft', 'Kitchen Qual_Po'],\n",
- " dtype='object'))"
- ]
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "newdf.shape, newdf.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "((879, 29),\n",
- " Index(['Id', 'Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars',\n",
- " 'Total Bsmt SF', '1st Flr SF', 'Year Built', 'Year Remod/Add',\n",
- " 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area', 'TotRms AbvGrd',\n",
- " 'Fireplaces', 'BsmtFin SF 1', 'Lot Frontage', 'Open Porch SF',\n",
- " 'Wood Deck SF', 'Lot Area', 'Central Air_Y', 'Kitchen Qual_Fa',\n",
- " 'Kitchen Qual_Gd', 'Kitchen Qual_Po', 'Kitchen Qual_TA',\n",
- " 'Paved Drive_P', 'Paved Drive_Y', 'garage', 'Sq Ft', 'SalePrice'],\n",
- " dtype='object'))"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "submission.shape, submission.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(879, 29)"
- ]
- },
- "execution_count": 43,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "submission.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [],
- "source": [
- "submission = submission[newdf.columns]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [],
- "source": [
- "assert (submission.columns == newdf.columns).all().all()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>Id</th>\n",
- " <th>SalePrice</th>\n",
- " <th>Overall Qual</th>\n",
- " <th>Gr Liv Area</th>\n",
- " <th>Garage Area</th>\n",
- " <th>Garage Cars</th>\n",
- " <th>Total Bsmt SF</th>\n",
- " <th>1st Flr SF</th>\n",
- " <th>Year Built</th>\n",
- " <th>Year Remod/Add</th>\n",
- " <th>...</th>\n",
- " <th>Lot Area</th>\n",
- " <th>Central Air_Y</th>\n",
- " <th>Kitchen Qual_Fa</th>\n",
- " <th>Kitchen Qual_Gd</th>\n",
- " <th>Kitchen Qual_TA</th>\n",
- " <th>Paved Drive_P</th>\n",
- " <th>Paved Drive_Y</th>\n",
- " <th>garage</th>\n",
- " <th>Sq Ft</th>\n",
- " <th>Kitchen Qual_Po</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>109</td>\n",
- " <td>130500</td>\n",
- " <td>6</td>\n",
- " <td>1479</td>\n",
- " <td>475</td>\n",
- " <td>2.0</td>\n",
- " <td>725</td>\n",
- " <td>725</td>\n",
- " <td>1976</td>\n",
- " <td>2005</td>\n",
- " <td>...</td>\n",
- " <td>13517</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>950.0</td>\n",
- " <td>525625</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>1 rows Γ 29 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " Id SalePrice Overall Qual Gr Liv Area Garage Area Garage Cars \\\n",
- "0 109 130500 6 1479 475 2.0 \n",
- "\n",
- " Total Bsmt SF 1st Flr SF Year Built Year Remod/Add ... \\\n",
- "0 725 725 1976 2005 ... \n",
- "\n",
- " Lot Area Central Air_Y Kitchen Qual_Fa Kitchen Qual_Gd Kitchen Qual_TA \\\n",
- "0 13517 1 0 1 0 \n",
- "\n",
- " Paved Drive_P Paved Drive_Y garage Sq Ft Kitchen Qual_Po \n",
- "0 0 1 950.0 525625 0 \n",
- "\n",
- "[1 rows x 29 columns]"
- ]
- },
- "execution_count": 46,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "newdf.head(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>Id</th>\n",
- " <th>SalePrice</th>\n",
- " <th>Overall Qual</th>\n",
- " <th>Gr Liv Area</th>\n",
- " <th>Garage Area</th>\n",
- " <th>Garage Cars</th>\n",
- " <th>Total Bsmt SF</th>\n",
- " <th>1st Flr SF</th>\n",
- " <th>Year Built</th>\n",
- " <th>Year Remod/Add</th>\n",
- " <th>...</th>\n",
- " <th>Lot Area</th>\n",
- " <th>Central Air_Y</th>\n",
- " <th>Kitchen Qual_Fa</th>\n",
- " <th>Kitchen Qual_Gd</th>\n",
- " <th>Kitchen Qual_TA</th>\n",
- " <th>Paved Drive_P</th>\n",
- " <th>Paved Drive_Y</th>\n",
- " <th>garage</th>\n",
- " <th>Sq Ft</th>\n",
- " <th>Kitchen Qual_Po</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>2658</td>\n",
- " <td>0</td>\n",
- " <td>6</td>\n",
- " <td>1928</td>\n",
- " <td>440</td>\n",
- " <td>1.0</td>\n",
- " <td>1020</td>\n",
- " <td>908</td>\n",
- " <td>1910</td>\n",
- " <td>1950</td>\n",
- " <td>...</td>\n",
- " <td>9142</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>440.0</td>\n",
- " <td>926160</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>1 rows Γ 29 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " Id SalePrice Overall Qual Gr Liv Area Garage Area Garage Cars \\\n",
- "0 2658 0 6 1928 440 1.0 \n",
- "\n",
- " Total Bsmt SF 1st Flr SF Year Built Year Remod/Add ... \\\n",
- "0 1020 908 1910 1950 ... \n",
- "\n",
- " Lot Area Central Air_Y Kitchen Qual_Fa Kitchen Qual_Gd Kitchen Qual_TA \\\n",
- "0 9142 0 1 0 0 \n",
- "\n",
- " Paved Drive_P Paved Drive_Y garage Sq Ft Kitchen Qual_Po \n",
- "0 0 1 440.0 926160 0 \n",
- "\n",
- "[1 rows x 29 columns]"
- ]
- },
- "execution_count": 47,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "submission.head(1)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 9. Adding the predicted SalesPrices to the submission set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [],
- "source": [
- "submission['SalePrice'] = y_submission"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 10. Subsetting the two required columns. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "submission1 = submission[['Id','SalePrice']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>Id</th>\n",
- " <th>SalePrice</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>2658</td>\n",
- " <td>152491.211911</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " Id SalePrice\n",
- "0 2658 152491.211911"
- ]
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "submission1.head(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "# I have the index column (so it's 3 columns instead of 2)\n",
- "# I know there is a way to just drop index while saving to csv, but I couldn't figure out how\n",
- "# so dropping it by hand"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [],
- "source": [
- "submission1.set_index('Id', inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>SalePrice</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>Id</th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>2658</th>\n",
- " <td>152491.211911</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " SalePrice\n",
- "Id \n",
- "2658 152491.211911"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "submission1.head(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [],
- "source": [
- "#submission1.to_csv('submission3.csv')"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python [conda env:dsi]",
- "language": "python",
- "name": "conda-env-dsi-py"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment