import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn.preprocessing import LabelEncoder
# from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import RobustScaler
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
# from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn.metrics import mean_squared_error
# import xgboost as xgb
# import lightgbm as lgb
# import warnings
# try plotly

# PART 0
# IMPORTING THE DATA SETS

# warnings.filterwarnings('ignore')
# %matplotlib inline

pd.set_option('display.float_format', lambda x: '{:.4f}'.format(x))
# 4 decimal places; more decimals might give better results

train = pd.read_csv('kaggle/train.csv')
test = pd.read_csv('kaggle/test.csv')
# keep reserve copies; .copy() is needed, otherwise the drop below would also
# remove Id from trainRes/testRes since they would share the same frame
trainRes = train.copy()
testRes = test.copy()

# removing the Id column because it is not needed
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)


# PART 1
# ANALYSIS OF SALEPRICE


# plotting the distribution of SalePrice
# print(train['SalePrice'].describe())
# sns.distplot(train['SalePrice'])
# plt.show()

# skewness and kurtosis
# print("Skewness: %f" % train['SalePrice'].skew())
# print("Kurtosis: %f" % train['SalePrice'].kurt())

# plot compared to the normal distribution
# sns.distplot(train['SalePrice'], fit=norm)
# plt.show()

# QQ plot
# fig = plt.figure()
# res = stats.probplot(train['SalePrice'], plot=plt)
# plt.show()

# logarithmic normalization of the SalePrice attribute
# train['SalePrice'] = np.log1p(train['SalePrice'])

# plot of log-normalized SalePrice against the normal dist.
# sns.distplot(train['SalePrice'], fit=norm)
# plt.show()

# QQ plot of log-normalized SalePrice against the normal dist.
# fig = plt.figure()
# res = stats.probplot(train['SalePrice'], plot=plt)
# plt.show()


# PART 2
# DATA CLEANING


# printing a frame with the attributes that have the most missing values and
# their percentages of incompleteness
# missingData = (train.isnull().sum() / len(train)) * 100
# missingData = missingData.drop(missingData[missingData == 0].index)\
#     .sort_values(ascending=False)[:30]
# missingDataFrame = pd.DataFrame({'Percent Missing': missingData})
# print(missingDataFrame)


# eliminating missing values where NA means the feature is not present, i.e. "None"
# train['PoolQC'] = train['PoolQC'].fillna("None")
# train['MiscFeature'] = train['MiscFeature'].fillna("None")
# train['Alley'] = train['Alley'].fillna('None')
# train['Fence'] = train['Fence'].fillna('None')
# train['FireplaceQu'] = train['FireplaceQu'].fillna('None')
#
# LotFrontage: impute with the median frontage of the same neighborhood
# train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage']\
#     .transform(lambda x: x.fillna(x.median()))

# fill BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF,
# BsmtFullBath and BsmtHalfBath with 0 where basement is not present #SR4%
# check for garage, basement sr4%
# check for MSZoning in case of merging train and test
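# A minimal sketch of the fills noted above, kept commented out like the rest of the
# cleaning code. It assumes the basement bath counts are 0 when no basement exists,
# and that MSZoning can be filled with its most frequent class; both are assumptions
# to verify rather than settled choices.
# for att in ('BsmtFullBath', 'BsmtHalfBath'):
#     train[att] = train[att].fillna(0)
# train['MSZoning'] = train['MSZoning'].fillna(train['MSZoning'].mode()[0])
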
# train['MasVnrType'] = train['MasVnrType'].fillna('None')
# train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
# MasVnrType and MasVnrArea have the same percentage of missing values, so they are
# most likely missing in the same records; a missing value most likely means the
# house has no masonry veneer
# TRY WITH AND WITHOUT THIS AND WRITE IT UP IN THE DOCUMENTATION


# for att in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
#     train[att] = train[att].fillna('None')
# for att in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
#     train[att] = train[att].fillna(0)
# a missing garage most likely means zero for the numerical attributes and 'None'
# for the categorical ones; the confidence comes from these attributes sharing the
# same percentage of missing values, which implies that it is most likely the same
# set of records in which the garage values are absent.

# for att in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
#             'TotalBsmtSF'):  # 'BsmtFullBath', 'BsmtHalfBath'):
#     train[att] = train[att].fillna(0)
#
# for att in ('BsmtQual', 'BsmtCond', 'BsmtExposure',
#             'BsmtFinType1', 'BsmtFinType2'):
#     train[att] = train[att].fillna('None')
# the same reasoning goes for the basement-related missing values

# train['Electrical'] = train['Electrical'].fillna(train['Electrical'].mode()[0])
# since only an extremely small portion of the Electrical values are missing,
# the mode of all values should be appropriate
# STILL, CHECK FOR SOME OTHER WAY OF INFERRING THIS VALUE
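# One possible alternative hinted at above, mirroring the Neighborhood-based
# LotFrontage imputation: take the most frequent Electrical value within each
# neighborhood instead of the global mode. A sketch of one option, not a verified
# improvement.
# train['Electrical'] = train.groupby('Neighborhood')['Electrical']\
#     .transform(lambda x: x.fillna(x.mode()[0]))
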
# attributesEncode = ('BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
#                     'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
#                     'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence',
#                     'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape',
#                     'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass',
#                     'OverallCond', 'YrSold', 'MoSold', 'FireplaceQu')
# # MIX UP VARIABLES, OR BETTER, IN THE ORDER FROM THE DESCRIPTION
# for att in attributesEncode:
#     lbl = LabelEncoder()
#     lbl.fit(list(train[att].values))
#     train[att] = lbl.transform(list(train[att].values))
# these are clearly categorical attributes that should be numerical,
# so apply label encoding

# ADD TOTAL SQFOOTAGE
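# A possible implementation of the total square footage idea above: sum the basement,
# first-floor and second-floor areas into one feature. The 'TotalSF' name and the
# exact column choice are assumptions, not part of the original script.
# train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']

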
# PART 3
# CORRELATION OF SALEPRICE TO OTHER ATTRIBUTES


# correlation matrix represented as a heatmap
# corrmat = train.corr()
# fig, ax = plt.subplots(figsize=(12, 9))
# sns.heatmap(corrmat, vmax=.8, square=True)
# plt.show()

# heatmap of the attributes most correlated with SalePrice
# cols = corrmat.nlargest(12, 'SalePrice')['SalePrice'].index
# corrmat10 = np.corrcoef(train[cols].values.T)
# .T transposes the array because np.corrcoef expects each variable as a row
# hm10 = sns.heatmap(corrmat10, annot=True,
#                    square=True, fmt='.2f', annot_kws={'size': 12},
#                    yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

# pairplots of selected attributes
sns.set()
atts = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']
sns.pairplot(train[atts], height=2.5)  # 'size' was renamed to 'height' in newer seaborn
plt.show()

sns.set()
atts = ['TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
        'YearRemodAdd']
sns.pairplot(train[atts], height=2)
plt.show()



# scatter plot of SalePrice and GrLivArea
# fig, ax = plt.subplots()
# ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
# plt.xlabel('GrLivArea')
# plt.ylabel('SalePrice')
# plt.show()

# scatter plot of SalePrice and TotalBsmtSF
# fig, ax = plt.subplots()
# ax.scatter(x=train['TotalBsmtSF'], y=train['SalePrice'])
# plt.xlabel('TotalBsmtSF')
# plt.ylabel('SalePrice')
# plt.show()