import numpy as np
import matplotlib.pyplot as plt

from data import get_data, inspect_data, split_data

data = get_data()
inspect_data(data)
train_data, test_data = split_data(data)
# Simple Linear Regression
# Predict MPG (y, the dependent variable) from Weight (x, the independent variable)
# using the closed-form solution.
# y = theta_0 + theta_1 * x -- we want the theta_0 and theta_1 that minimize the prediction error.
# We measure the error with the MSE metric:
# MSE = (1/n) * SUM (from i=1 to n) (actual_output_i - predicted_output_i) ** 2
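# A minimal helper for the metric above (the name `mse` is just for
# illustration; the script below also computes it inline with np.mean):
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)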
# get the columns
y_train = train_data['MPG'].to_numpy().reshape(-1, 1)
x_train = train_data['Weight'].to_numpy().reshape(-1, 1)
y_test = test_data['MPG'].to_numpy().reshape(-1, 1)
x_test = test_data['Weight'].to_numpy().reshape(-1, 1)
# closed-form solution (normal equation): theta = (X^T X)^{-1} X^T y
X_b = np.c_[np.ones((x_train.shape[0], 1)), x_train]  # prepend a column of ones for the intercept
theta_best = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y_train
print(f"theta best: {theta_best}")
print(f"X_b.shape: {X_b.shape}")
# training error of the closed-form fit
predicted = X_b @ theta_best
MSE_best = np.mean((y_train - predicted) ** 2)
print(f"MSE best (train): {MSE_best}")
# plot the regression line over the test data
x = np.linspace(x_test.min(), x_test.max(), 100)
y = theta_best[0, 0] + theta_best[1, 0] * x
plt.plot(x, y)
plt.scatter(x_test, y_test)
plt.xlabel('Weight')
plt.ylabel('MPG')
plt.show()
# standardization (z-score): use the *training* mean and std for both splits,
# so no information from the test set leaks into the scaling
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
x_train_scaled = (x_train - mean) / std
x_test_scaled = (x_test - mean) / std
X_b = np.c_[np.ones((x_train_scaled.shape[0], 1)), x_train_scaled]
X_test_b = np.c_[np.ones((x_test_scaled.shape[0], 1)), x_test_scaled]
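# The same scaling via scikit-learn (an equivalent alternative, assuming
# sklearn is available; fit on the training split only, as above):
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(x_train)
# x_train_scaled = scaler.transform(x_train)
# x_test_scaled = scaler.transform(x_test)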
# Batch Gradient Descent on the standardized features
lr = 0.01
epochs = 5000
theta_best = np.zeros((X_b.shape[1], 1))
for _ in range(epochs):
    # gradient of MSE: (2/n) * X^T (X theta - y)
    gradient = (2 / X_b.shape[0]) * X_b.T @ (X_b @ theta_best - y_train)
    theta_best -= lr * gradient
print(f"Optimal theta: {theta_best.flatten()}")
# test error of the gradient-descent fit
y_pred = X_test_b @ theta_best
MSE_test = np.mean((y_test - y_pred) ** 2)
print(f"MSE gradient (test): {MSE_test}")
# plot the regression line over the standardized test data
x = np.linspace(x_test_scaled.min(), x_test_scaled.max(), 100)
y = theta_best[0, 0] + theta_best[1, 0] * x
plt.plot(x, y)
plt.scatter(x_test_scaled, y_test)
plt.xlabel('Weight (standardized)')
plt.ylabel('MPG')
plt.show()