a guest Dec 16th, 2018
1. import numpy as np
2. import matplotlib.pyplot as plt
3. import pandas as pd
4. import seaborn as sns
6. from sklearn.model_selection import train_test_split
7.
9.
# Build the working DataFrame: one column per feature, plus the regression
# target stored under 'MEDV'.
# NOTE(review): `boston_dataset` is not defined in the visible part of this
# script; presumably it is a scikit-learn dataset bundle loaded earlier --
# confirm before running.
print(boston_dataset.keys())
bos = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
bos['MEDV'] = boston_dataset.target
# Show the last ten rows as a quick sanity check.
print(bos.tail(10))
18.
# Structural summary of the DataFrame.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
bos.info(verbose=True)

# Summary statistics for the columns.
print(bos.describe())
# Answers noted down from the describe() output (which columns they refer to
# is not recorded here):
# a) 3.613524 and 8.601545
# b) 5.000000 and 50.000000
# c) 11.360000
29.
# Distribution of the target column: set the seaborn figure size, then draw
# MEDV with 30 bins.
figure_size = (11.7, 8.27)
sns.set(rc={'figure.figsize': figure_size})
medv = bos['MEDV']
sns.distplot(medv, bins=30)
plt.show()
34.
# Pairwise correlations between all columns, rounded to two decimals and
# drawn as an annotated heatmap.
correlation_matrix = bos.corr()
correlation_matrix = correlation_matrix.round(2)
sns.heatmap(correlation_matrix, annot=True)
plt.show()
# a) RM - number of rooms
# b) LSTAT - % of lower-status population
42.
# Scatter plots with a fitted regression line for three features against
# MEDV. Columns are referenced by name, so `data` has to be the DataFrame
# that actually holds them (`bos`), not the raw dataset bundle.

# positively correlated - RM
ax1 = sns.regplot(x='MEDV', y='RM', data=bos)
plt.show()
# negatively correlated - LSTAT
ax2 = sns.regplot(x='MEDV', y='LSTAT', data=bos)
plt.show()
# least correlated - CHAS
ax3 = sns.regplot(x='MEDV', y='CHAS', data=bos)
plt.show()
52.
# Feature matrix (LSTAT and RM) and target vector (MEDV).
# The columns are selected directly from `bos` instead of being rebuilt
# through a NumPy array, so X keeps the same index as Y; .copy() keeps X
# independent of `bos`.
X = bos[['LSTAT', 'RM']].copy()
Y = bos['MEDV']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)
for subset in (X_train, X_test, Y_train, Y_test):
    print(subset.shape)
63.
65.
66. from sklearn.linear_model import LinearRegression
67.
# Fit a linear regression model on the training split.
lm = LinearRegression()
lm.fit(X_train, Y_train)

# The labels containing "\hat" are raw strings: in a normal string literal
# "\h" is an invalid escape sequence, which newer Python versions warn about.

# a) actual vs. predicted values on the training split
Y_train_predict = lm.predict(X_train)
plt.scatter(Y_train, Y_train_predict)
plt.xlabel("Train Prices: $Y_i$")
plt.ylabel(r"Predicted train prices: $\hat{Y}_i$")
plt.title(r"Train prices vs Predicted train prices: $Y_i$ vs $\hat{Y}_i$")
plt.show()

# b) actual vs. predicted values on the test split
Y_test_predict = lm.predict(X_test)
plt.scatter(Y_test, Y_test_predict)
plt.xlabel("Test prices: $Y_i$")
plt.ylabel(r"Predicted test prices: $\hat{Y}_i$")
plt.title(r"Test prices vs Predicted test prices: $Y_i$ vs $\hat{Y}_i$")
plt.show()
86.
88.
89. from sklearn.metrics import mean_absolute_error
90. from sklearn.metrics import mean_squared_error
91.
def _report_performance(heading, actual, predicted):
    """Print RMSE and MAE for one data split and return them as a tuple."""
    split_rmse = np.sqrt(mean_squared_error(actual, predicted))
    split_mae = mean_absolute_error(actual, predicted)
    print(heading)
    print("--------------------------------------")
    print('RMSE is {}'.format(split_rmse))
    print('MAE  is {}'.format(split_mae))
    return split_rmse, split_mae


# Training-set errors, followed by a blank separator before the next report.
rmse, mae = _report_performance(
    "The model performance for training set", Y_train, Y_train_predict
)
print("\n")

# Test-set errors; `rmse` and `mae` end up holding these values.
rmse, mae = _report_performance(
    "The model performance for testing set", Y_test, Y_test_predict
)
