Houses' SalePrice Regressor
makispaiktis | Feb 1st, 2024 | Python

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# 1a. Read the dataset
houses_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
houses = pd.read_csv(houses_path)
print(houses.head(), '\n\n')

# 1b. Explore the dataset
shape = houses.shape
print(f"---- Dataset contains {shape[0]} rows (records) with {shape[1]} columns (features) ----")
all_columns = list(houses.columns)
print("Column names:", all_columns)
print('\n\n')

# 1c. Separate the target ('SalePrice') from the other features
target = 'SalePrice'
houses.dropna(axis=0, subset=[target], inplace=True)
y = houses[target]
houses.drop([target], axis=1, inplace=True)
shape = houses.shape
print(f"Now, there are {len(houses.columns)} columns in the dataset (target = 'SalePrice' was removed)")



# 2a. Check for columns with missing data
cols_with_missing = [col for col in houses.columns if houses[col].isnull().any()]
print(f"There are {len(cols_with_missing)}/{shape[1]} columns WITH MISSING VALUES:\n {cols_with_missing} \n\n")

# 2b. Find all the numerical cols AND the categorical cols with low cardinality
numerical_cols = [cname for cname in houses.columns if houses[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in houses.columns if houses[cname].dtype == "object" and houses[cname].nunique() < 5]
my_cols = numerical_cols + categorical_cols

print(f"There are {len(numerical_cols)}/{shape[1]} NUMERICAL columns:")
print("Numerical cols =", numerical_cols, '\n\n')
print(f"There are {len(categorical_cols)}/{shape[1]} CATEGORICAL columns:")
print("Categorical cols =", categorical_cols, '\n\n')
print(f"I will SELECT {len(my_cols)}/{shape[1]} columns:")
print("My columns =", my_cols, '\n\n')

# 2c. Find out if there are cols with missing data among my selected columns (my_cols)
selected_with_missing = [col for col in my_cols if col in cols_with_missing]
numerical_with_missing = [col for col in numerical_cols if col in cols_with_missing]
categorical_with_missing = [col for col in categorical_cols if col in cols_with_missing]

print("There are {}/{} SELECTED columns WITH MISSING DATA: {}".format(len(selected_with_missing), len(my_cols), selected_with_missing))
print(f"= {len(numerical_with_missing)} numerical with missing data ---> {numerical_with_missing}")
print(f"+ {len(categorical_with_missing)} categorical with missing data ---> {categorical_with_missing}")
print("...Need imputation...\n\n\n")

print("******** SUMMARY ********")
print(f"Selected columns = {len(my_cols)}/{shape[1]}")
print(f"Numerical columns = {len(numerical_cols)}/{len(my_cols)}")
print(f"Categorical columns = {len(categorical_cols)}/{len(my_cols)}")
print(f"Selected cols with missing data = {len(selected_with_missing)}/{len(my_cols)}")
print(f"Numerical with missing data = {len(numerical_with_missing)}/{len(selected_with_missing)}")
print(f"Categorical with missing data = {len(categorical_with_missing)}/{len(selected_with_missing)}")

  69.  
  70. # 3a. Work with the selected columns of the dataset (my_cols)
  71. X = houses[my_cols].copy()
  72. y = y.copy()
  73.  
  74. SELECTED = 1000
  75. if SELECTED < shape[0]:
  76.     X = X.head(SELECTED)
  77.     y = y.head(SELECTED)
  78.  
  79. # 3b. Train, test, split
  80. from sklearn.model_selection import train_test_split
  81. X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
  82. print("X_train = {}, X_valid = {}\ny_train = {}, y_valid = {}".format(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape))



# 4a. Imputer + OH encoder, since there are missing values and categorical columns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant (0 by default)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])
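
# (Optional sanity check, not in the original script: fitting the preprocessor
# on its own shows how many features come out of imputation + one-hot encoding.
# Commented out so the script's behaviour is unchanged.)
# X_check = preprocessor.fit_transform(X)
# print("Preprocessed feature matrix shape:", X_check.shape)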



# 5. Model = Regressor (regr) ---> Pipeline

from sklearn.ensemble import RandomForestRegressor

regr = RandomForestRegressor(n_estimators=100, random_state=0)
# Note: a StandardScaler cannot be placed AFTER the model inside a Pipeline
# (every step except the last must be a transformer), and a Random Forest does
# not need feature scaling anyway, so the pipeline is just preprocessor + model.
pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
print(pipeline.steps)


# 6. Evaluate the RandomForestRegressor

from sklearn.metrics import mean_absolute_error

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
validation_shape = X_valid.shape
avg_y_valid = y_valid.mean()
avg_error = mae / avg_y_valid

print(f"Validation dataset shape = {validation_shape[0]} rows x {validation_shape[1]} columns")
print(f"MAE with RFR = {mae}\n")
print(f"Average SalePrice in validation dataset = {avg_y_valid}, so avg_error = {100 * avg_error}%")


# 7. Evaluate with cross-validation

from sklearn.model_selection import cross_val_score

# cross_val_score returns NEGATIVE MAE scores, so flip the sign to get MAE
scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
score = scores.mean()

print("Average MAE with RFR =", score, '\n\n')


# 8. Some more regressors

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

"""
# Polynomial Regressor (left out of the comparison below)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Create polynomial features (degree=3)
poly_features = PolynomialFeatures(degree=3)
X_poly = poly_features.fit_transform(X_train)
# Create and fit the polynomial regression model
regr0 = LinearRegression()
regr0.fit(X_poly, y_train)
"""

# 8a. Support Vector Machine Regressor
regr1 = SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.01)
# 8b. Support Vector Machine Regressor (different gamma/epsilon)
regr2 = SVR(kernel='rbf', C=1e3, gamma='auto', epsilon=0.1)
# 8c. Decision Tree Regressor
regr3 = DecisionTreeRegressor()
# 8d. Random Forest Regressor
regr4 = RandomForestRegressor()
# 8e. Logistic Regression is a classifier, so it is left out of this regression comparison
# from sklearn.linear_model import LogisticRegression
# regr5 = LogisticRegression(random_state=16)

regrs = [regr1, regr2, regr3, regr4]
scores_list = []

for regr in regrs:
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
    scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    score = scores.mean()
    scores_list.append(score)

    print('-----------------------------------------')
    print("{}".format(str(regr)))
    print("Score =", score)
    print('-----------------------------------------', '\n\n')



# 9. Summary of regressors - pipelines that I tried
best_regr = None
best_score = float('inf')
print("After cross-validation with 5 folds:")

for regr, score in zip(regrs, scores_list):
    print("{} ---> {}".format(regr, score))
    if score < best_score:
        best_score = score
        best_regr = regr

print("\n\n", "Best regressor = {} with average MAE = {}".format(best_regr, best_score))



# 10. Select the best regressor and tune it
# Since RFR is the best one, I can change its default hyperparameters to find a better model

n_estimators_list = list(range(50, 301, 50))
random_state_list = [0, 1]
best_score = float('inf')
best_rs = -1
best_ne = -1

for random_state in random_state_list:
    for n_estimators in n_estimators_list:
        regr = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        pipeline = Pipeline([('preprocessor', preprocessor), ('model', regr)])
        scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
        score = scores.mean()

        print('-------------------------------------------------------------')
        print("{}".format(str(regr)))
        print("Score =", score)
        print('-------------------------------------------------------------', '\n\n')

        if score < best_score:
            best_score = score
            best_rs = random_state
            best_ne = n_estimators
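
# (Alternative sketch, not in the original script: sklearn's GridSearchCV can
# replace the manual double loop above. Commented out so the script's behaviour
# is unchanged; the 'model__' prefixes assume the 'model' step name used throughout.)
# from sklearn.model_selection import GridSearchCV
# param_grid = {'model__n_estimators': n_estimators_list,
#               'model__random_state': random_state_list}
# search = GridSearchCV(Pipeline([('preprocessor', preprocessor),
#                                 ('model', RandomForestRegressor())]),
#                       param_grid, cv=5, scoring='neg_mean_absolute_error')
# search.fit(X, y)
# print(search.best_params_, "MAE =", -search.best_score_)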


# 11. One last evaluation
best_regr = RandomForestRegressor(n_estimators=best_ne, random_state=best_rs)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', best_regr)])
best_scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
best_score = best_scores.mean()

print('-------------------------------------------------------------')
print("Best regressor = {} with:".format(str(best_regr)))
print("Best score MAE =", best_score)
print('-------------------------------------------------------------', '\n\n')
print("Best n_estimators =", best_ne, "| Best random_state =", best_rs)



# 12. Check the predicted values against the real ones
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
comparison = pd.DataFrame({'Real values': y_valid,
                           'Predictions': preds,
                           'Absolute Error': abs(y_valid - preds),
                           'Error (%)': 100 * abs(y_valid - preds) / y_valid})
print(comparison)
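
# (Follow-up sketch, not in the original script: generating predictions for the
# competition test set. The test.csv path is an assumption based on the train.csv
# path above. Commented out so the script's behaviour is unchanged.)
# test_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
# test_data = pd.read_csv(test_path)
# X_test = test_data[my_cols]
# pipeline.fit(X, y)  # refit the tuned pipeline on all selected rows
# output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pipeline.predict(X_test)})
# output.to_csv('submission.csv', index=False)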