import numpy as np
import matplotlib.pyplot as plt

from data import get_data, inspect_data, split_data

data = get_data()
inspect_data(data)
train_data, test_data = split_data(data)
# Simple Linear Regression
# Predict MPG (y, the dependent variable) from Weight (x, the independent variable)
# using the closed-form solution.
# y = theta_0 + theta_1 * x -- we want the theta_0 and theta_1 that minimize the prediction error.
# We measure the error with the MSE metric:
# MSE = (1/n) * SUM (from i=1 to n) (actual_output_i - predicted_output_i) ** 2
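# A minimal helper for the metric above (the name `mse` is just for
# illustration; the script below also computes it inline with np.mean):
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)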
# get the columns
y_train = train_data['MPG'].to_numpy().reshape(-1, 1)
x_train = train_data['Weight'].to_numpy().reshape(-1, 1)
y_test = test_data['MPG'].to_numpy().reshape(-1, 1)
x_test = test_data['Weight'].to_numpy().reshape(-1, 1)
# closed-form solution (normal equation): theta = (X^T X)^{-1} X^T y
X_b = np.c_[np.ones((x_train.shape[0], 1)), x_train]  # prepend a column of ones for the intercept
theta_best = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y_train
print(f"theta best: {theta_best}")
print(f"X_b.shape: {X_b.shape}")
# training error of the closed-form fit
predicted = X_b @ theta_best
MSE_best = np.mean((y_train - predicted) ** 2)
print(f"MSE best (train): {MSE_best}")
# plot the regression line over the test data
x = np.linspace(x_test.min(), x_test.max(), 100)
y = theta_best[0, 0] + theta_best[1, 0] * x
plt.plot(x, y)
plt.scatter(x_test, y_test)
plt.xlabel('Weight')
plt.ylabel('MPG')
plt.show()
# standardization (z-score): use the *training* mean and std for both splits,
# so no information from the test set leaks into the scaling
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
x_train_scaled = (x_train - mean) / std
x_test_scaled = (x_test - mean) / std
X_b = np.c_[np.ones((x_train_scaled.shape[0], 1)), x_train_scaled]
X_test_b = np.c_[np.ones((x_test_scaled.shape[0], 1)), x_test_scaled]
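# The same scaling via scikit-learn (an equivalent alternative, assuming
# sklearn is available; fit on the training split only, as above):
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(x_train)
# x_train_scaled = scaler.transform(x_train)
# x_test_scaled = scaler.transform(x_test)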
# Batch Gradient Descent on the standardized features
lr = 0.01
epochs = 5000
theta_best = np.zeros((X_b.shape[1], 1))
for _ in range(epochs):
    # gradient of MSE: (2/n) * X^T (X theta - y)
    gradient = (2 / X_b.shape[0]) * X_b.T @ (X_b @ theta_best - y_train)
    theta_best -= lr * gradient
print(f"Optimal theta: {theta_best.flatten()}")
# test error of the gradient-descent fit
y_pred = X_test_b @ theta_best
MSE_test = np.mean((y_test - y_pred) ** 2)
print(f"MSE gradient (test): {MSE_test}")
# plot the regression line over the standardized test data
x = np.linspace(x_test_scaled.min(), x_test_scaled.max(), 100)
y = theta_best[0, 0] + theta_best[1, 0] * x
plt.plot(x, y)
plt.scatter(x_test_scaled, y_test)
plt.xlabel('Weight (standardized)')
plt.ylabel('MPG')
plt.show()