# Untitled

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn.preprocessing import LabelEncoder
# from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
# from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import RobustScaler
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
# from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn.metrics import mean_squared_error
# import xgboost as xgb
# import lightgbm as lgb
# import warnings
# try plotly

#PART 0
#IMPORTING SETS

#warnings.filterwarnings('ignore')
# %matplotlib inline

# Display-only setting: floats are rendered with 4 decimal places when
# pandas objects are printed; the stored values are not rounded.
pd.set_option('display.float_format', '{:.4f}'.format)

# Load the train and test sets from the kaggle/ directory (paths are relative
# to the current working directory).
train = pd.read_csv('kaggle/train.csv')
test = pd.read_csv('kaggle/test.csv')

# Keep snapshots of the frames as loaded. These have to be real copies: a
# plain assignment would only alias the same DataFrame, and the in-place
# drop below would then remove "Id" from the snapshots as well.
trainRes = train.copy()
testRes = test.copy()

# Remove the Id column since it is not needed for the analysis.
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)


                                                                                        # PART 1
                                                                                        # ANALYSIS OF SALEPRICE


                                                                        # plotting distribution of SalePrice
# print(train['SalePrice'].describe())
# sns.distplot(train['SalePrice'])
# plt.show()

                                                                        # skewness and kurtosis
# print("Skewness: %f" % train['SalePrice'].skew())
# print("Kurtosis: %f" % train['SalePrice'].kurt())

                                                                        # plot compared to normal distribution
# sns.distplot(train['SalePrice'], fit=norm)
# plt.show()

                                                                        # QQ plot
# fig = plt.figure()
# res=stats.probplot(train['SalePrice'], plot=plt)
# plt.show()

                                                                        # Logarithmic normalization of att. SalePrice
# train['SalePrice'] = np.log1p(train['SalePrice'])

                                                                        # plot of log normalized SalePrice against normal dist.
# sns.distplot(train['SalePrice'], fit=norm)
# plt.show()

                                                                        # QQ plot of log normalized SalePrice against normal dist.
# fig = plt.figure()
# res=stats.probplot(train['SalePrice'], plot=plt)
# plt.show()


                                                                                        # PART 2
                                                                                        # DATA CLEANING


                                                                        # printing a frame with attributes with most missing values and
                                                                        # percentages of incompleteness
# missingData = (train.isnull().sum() / len (train)) * 100
# missingData = missingData.drop(missingData[missingData==0].index)\
#                             .sort_values(ascending = False)[:30]
# missingDataFrame = pd.DataFrame({'Percent Missing' :missingData})
# print(missingDataFrame)


                                                                        # eliminating missing values where NA means not present i.e. NONE
# train['PoolQC'] = train['PoolQC'].fillna("None")
# train['MiscFeature'] = train['MiscFeature'].fillna("None")
# train['Alley'] = train['Alley'].fillna('None')
# train['Fence'] = train['Fence'].fillna('None')
# train['FireplaceQu'] = train['FireplaceQu'].fillna('None')
#
# train["LotFrontage"] = train.groupby('Neighborhood')['LotFrontage']\
#                          .transform(lambda x: x.fillna(x.median()))

                                                                        # fill BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF,
                                                                        # BsmtFullBath and BsmtHalfBath with 0 where basement is not present #SR4%
                                                                        # check for garage, basement sr4%
                                                                        # check for MSZoning in case of merging train and test

# train['MasVnrType'] = train['MasVnrType'].fillna('None')
# train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
                                                                        # same percentage of missing values for MasVnr[Type,Area] so most likely
                                                                        # they are the same records. Also most likely having no value means not having
                                                                        # a masonry veneer
                                                                        # TODO: try both with and without this fill and record the results in the documentation


# for att in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
#     train[att] = train[att].fillna('None')
# for att in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
#     train[att] = train[att].fillna(0)
                                                                        # missing garage values most likely mean there is no garage, so use
                                                                        # zero for the numerical and 'None' for the categorical attributes.
                                                                        # The confidence comes from these attributes sharing the same percentage
                                                                        # of missing values, which implies it is most likely the same set of records.

# for att in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
#             'TotalBsmtSF'): #'BsmtFullBath', 'BsmtHalfBath'):
#     train[att] = train[att].fillna(0)
#
# for att in ('BsmtQual', 'BsmtCond', 'BsmtExposure',
#             'BsmtFinType1', 'BsmtFinType2'):
#     train[att] = train[att].fillna('None')
                                                                        # same goes for basement related missing values

# train['Electrical'] = train['Electrical'].fillna(all_data['Electrical'].mode()[0])
                                                                        # since only an extremely small portion of the Electrical values is missing,
                                                                        # the mode (most frequent value) should be appropriate
                                                                        # NOTE: `all_data` is not defined in this file; use `train` or a merged train+test frame
                                                                        # STILL, CHECK FOR SOME OTHER WAY OF INFERRING THIS VALUE

# attributesEncode = ('BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
#         'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
#         'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
#         'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
#         'YrSold', 'MoSold', 'FireplaceQu', ) # TODO: reorder the attributes, or better, list them in the order of the data description
# for att in attributesEncode:
#     lbl = LabelEncoder()
#     lbl.fit(list(train[att].values))
#     train[att] = lbl.transform(list(train[att].values))
                                                                        # obvious categorical attributes which should be numerical
                                                                        # thus, apply label encoding

                                                                        # ADD TOTAL SQFOOTAGE


                                                                                        # PART 3
                                                                                        # CORRELATION OF SALEPRICE TO OTHER ATTRIBUTES


                                                                        # correlation matrix represented through heatmap
# corrmat = train.corr()
# fig, ax = plt.subplots(figsize= (12, 9))
# sns.heatmap(corrmat, vmax= .8, square= True)
# #plt.show()

                                                                        # heatmap of the attributes most correlated with SalePrice
# cols = corrmat.nlargest(12, 'SalePrice')['SalePrice'].index
# corrmat10 = np.corrcoef(train[cols].values.T)  # .T transposes so each row is one attribute, which is what np.corrcoef expects
# hm10 = sns.heatmap(corrmat10, annot=True,
#                    square=True, fmt='.2f', annot_kws={'size': 12},
#                    yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

# Pairwise scatter plots (histograms on the diagonal) for two groups of
# attributes. `height` is the per-facet size in inches; it replaces the
# `size` keyword, which seaborn renamed in 0.9 and now warns about or rejects.
sns.set()  # apply seaborn's default theme once for both figures

# Group 1: SalePrice together with quality / living-area / garage attributes.
atts = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']
sns.pairplot(train[atts], height=2.5)
plt.show()

# Group 2: basement, floor, bath, room and year attributes.
# NOTE(review): SalePrice is not part of this group, so this figure shows no
# relation to the target - confirm whether that is intended.
atts = ['TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
        'YearRemodAdd']
sns.pairplot(train[atts], height=2)
plt.show()


                                                                        # scatter plot of SalePrice and GrLivArea
# fig, ax = plt.subplots()
# ax.scatter(x = train['GrLivArea'], y = train['SalePrice'])
# plt.xlabel('GrLivArea')
# plt.ylabel('SalePrice')
# plt.show()

                                                                        #scatterplot of SalePrice and TotalBsmtSF
# fig, ax = plt.subplots()
# ax.scatter(x = train['TotalBsmtSF'], y = train['SalePrice'])
# plt.xlabel('TotalBsmtSF')
# plt.ylabel('SalePrice')
# plt.show()