Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- from scipy import stats
- from scipy.stats import norm, skew
- from sklearn.preprocessing import LabelEncoder
- # from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
- # from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
- # from sklearn.kernel_ridge import KernelRidge
- # from sklearn.pipeline import make_pipeline
- # from sklearn.preprocessing import RobustScaler
- # from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
- # from sklearn.model_selection import KFold, cross_val_score, train_test_split
- # from sklearn.metrics import mean_squared_error
- # import xgboost as xgb
- # import lightgbm as lgb
- # import warnings
- # try plotly
- #PART 0
- #IMPORTING SETS
- #warnings.filterwarnings('ignore')
- # %matplotlib inline
# Print floats with four decimal places whenever pandas displays them.
# This option only changes how values are shown, not the stored data.
pd.set_option('display.float_format', '{:.4f}'.format)

# Load the train and test sets; paths are relative to the working directory.
train = pd.read_csv('kaggle/train.csv')
test = pd.read_csv('kaggle/test.csv')

# Keep independent snapshots of the frames exactly as loaded. A plain
# assignment would only alias the same objects, so the in-place drops
# below would silently strip the Id column from the snapshots as well.
trainRes = train.copy()
testRes = test.copy()

# Remove the Id column from the working frames because it is not needed.
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)
- # PART 1
- # ANALYSIS OF SALEPRICE
- # plotting distribution of SalePrice
- # print(train['SalePrice'].describe())
- # sns.distplot(train['SalePrice'])
- # plt.show()
- # skewness and kurtosis FIX NAMES
- # print("Skewness: %f" % train['SalePrice'].skew())
- # print("Kurtosis: %f" % train['SalePrice'].kurt())
- # plot compared to normal distribution
- # sns.distplot(train['SalePrice'], fit=norm)
- # plt.show()
- # QQ plot
- # fig = plt.figure()
- # res=stats.probplot(train['SalePrice'], plot=plt)
- # plt.show()
- # Logarithmic normalization of att. SalePrice
- # train['SalePrice'] = np.log1p(train['SalePrice'])
- # plot of log normalized SalePrice against normal dist.
- # sns.distplot(train['SalePrice'], fit=norm)
- # plt.show()
- # QQ plot of log normalized SalePrice against normal dist.
- # fig = plt.figure()
- # res=stats.probplot(train['SalePrice'], plot=plt)
- # plt.show()
- # PART 2
- # DATA CLEANING
- # printing a frame with attributes with most missing values and
- # percentages of incompleteness
- # missingData = (train.isnull().sum() / len (train)) * 100
- # missingData = missingData.drop(missingData[missingData==0].index)\
- # .sort_values(ascending = False)[:30]
- # missingDataFrame = pd.DataFrame({'Percent Missing' :missingData})
- # print(missingDataFrame)
- # eliminating missing values where NA means not present i.e. NONE
- # train['PoolQC'] = train['PoolQC'].fillna("None")
- # train['MiscFeature'] = train['MiscFeature'].fillna("None")
- # train['Alley'] = train['Alley'].fillna('None')
- # train['Fence'] = train['Fence'].fillna('None')
- # train['FireplaceQu'] = train['FireplaceQu'].fillna('None')
- #
- # train["LotFrontage"] = train.groupby('Neighborhood')['LotFrontage']\
- # .transform(lambda x: x.fillna(x.median()))
- # fill BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF,
- # BsmtFullBath and BsmtHalfBath with 0 where basement is not present #SR4%
- # check for garage, basement sr4%
- # check for MSZoning in case of merging train and test
- # train['MasVnrType'] = train['MasVnrType'].fillna('None')
- # train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
- # same percentage of missing values for MasVnr[Type,Area] so most likely
- # they are the same records. Also most likely having no value means not having
- # a masonry veneer
- # #TRY WITH AND WITHOUT AND WRITE IT DOWN IN THE DOCUMENTATION
- # for att in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
- # train[att] = train[att].fillna('None')
- # for att in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
- # train[att] = train[att].fillna(0)
- # these missing values most likely mean zero for numerical and None for categorical att's
- # the confidence in them being zero and None comes from the identical percentage
- # of missing values, which implies that it is most likely the same set of
- # records in which those values are not present.
- # for att in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
- # 'TotalBsmtSF'): #'BsmtFullBath', 'BsmtHalfBath'):
- # train[att] = train[att].fillna(0)
- #
- # for att in ('BsmtQual', 'BsmtCond', 'BsmtExposure',
- # 'BsmtFinType1', 'BsmtFinType2'):
- # train[att] = train[att].fillna('None')
- # same goes for basement related missing values
- # train['Electrical'] = train['Electrical'].fillna(all_data['Electrical'].mode()[0])
- # since only an extremely small portion of the Electrical values is missing
- # the mode (most frequent value) of all values should be appropriate
- # STILL, CHECK FOR SOME OTHER WAY OF INFERRING THIS VALUE
- # attributesEncode = ('BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
- # 'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
- # 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
- # 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
- # 'YrSold', 'MoSold', 'FireplaceQu') # MIX UP VARIABLES, OR BETTER, IN THE ORDER OF THE DESCRIPTION
- # for att in attributesEncode:
- # lbl = LabelEncoder()
- # lbl.fit(list(train[att].values))
- # train[att] = lbl.transform(list(train[att].values))
- # obvious categorical attributes which should be numerical
- # thus, apply label encoding
- # ADD TOTAL SQFOOTAGE
- # PART 3
- # CORRELATION OF SALEPRICE TO OTHER ATTRIBUTES
- # correlation matrix represented through heatmap
- # corrmat = train.corr()
- # fig, ax = plt.subplots(figsize= (12, 9))
- # sns.heatmap(corrmat, vmax= .8, square= True)
- # #plt.show()
- # heatmap of most correlated att's to SalePrice
- # cols = corrmat.nlargest(12, 'SalePrice')['SalePrice'].index
- # corrmat10 = np.corrcoef(train[cols].values.T) # CHECK FOR MEANING OF T
- # hm10 = sns.heatmap(corrmat10, annot=True,
- # square=True, fmt='.2f', annot_kws={'size': 12},
- # yticklabels=cols.values, xticklabels=cols.values)
- # plt.show()
# Apply seaborn's default theme once; it stays active for later figures.
sns.set()

# Pair plot of SalePrice together with four other attributes of `train`.
atts = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']
# `height` is the current name of the per-facet size keyword; the older
# `size` spelling was renamed in seaborn 0.9 and is rejected by newer releases.
sns.pairplot(train[atts], height=2.5)
plt.show()

# Pair plot of a second attribute group, drawn with smaller facets.
# NOTE(review): SalePrice is not part of this group, so these plots only
# relate the listed attributes to each other - confirm this is intended.
atts = ['TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
        'YearRemodAdd']
sns.pairplot(train[atts], height=2)
plt.show()
- # scatter plot of SalePrice and GrLivArea
- # fig, ax = plt.subplots()
- # ax.scatter(x = train['GrLivArea'], y = train['SalePrice'])
- # plt.xlabel('GrLivArea')
- # plt.ylabel('SalePrice')
- # plt.show()
- #scatterplot of SalePrice and TotalBsmtSF
- # fig, ax = plt.subplots()
- # ax.scatter(x = train['TotalBsmtSF'], y = train['SalePrice'])
- # plt.xlabel('TotalBsmtSF')
- # plt.ylabel('SalePrice')
- # plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement