Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn.utils import shuffle
class LinearRegression:
    """Ordinary least-squares linear regression, ``y`` approximated by ``X @ w``.

    No intercept is added automatically: callers that want one must put a
    column of ones into ``X`` themselves.

    Attributes assigned by the methods:
        X, y: training data from the last ``fit`` call, as float arrays.
        w: fitted weight vector.
        prediction: result of the most recent ``predict`` call.
        sqr_error: residual sum of squares from the most recent ``score``.
        sqr_error_mean: total sum of squares from the most recent ``score``.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Estimate the weights ``w`` that minimise ``||X @ w - y||^2``.

        Args:
            X: 2-D array-like of features, one row per sample.
            y: 1-D array-like of targets, one entry per row of ``X``.
        """
        # Coerce to float so mixed/object-dtype inputs do not break the solver.
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y, dtype=float)
        # lstsq solves the same normal equations as inv(X.T @ X) @ X.T @ y,
        # but stays well defined (minimum-norm solution) when columns of X
        # are collinear, where the explicit inverse fails or is inaccurate.
        self.w, *_ = np.linalg.lstsq(self.X, self.y, rcond=None)

    def predict(self, X):
        """Return ``X @ w`` for the fitted weights.

        The stored training data is left untouched; only ``prediction`` is
        updated.
        """
        self.prediction = np.asarray(X, dtype=float) @ self.w
        return self.prediction

    def score(self, X, y):
        """Return the coefficient of determination R^2 on ``(X, y)``.

        R^2 = 1 - SS_res / SS_tot, where SS_tot is measured around the mean of
        the observed ``y`` (not the mean of the predictions).

        Returns:
            1.0 for a perfect fit; can be negative when the model is worse
            than always predicting ``mean(y)``. If ``y`` is constant, returns
            1.0 when the predictions are exact and 0.0 otherwise.
        """
        y = np.asarray(y, dtype=float)
        residuals = y - self.predict(X)
        self.sqr_error = float(np.sum(residuals ** 2))
        self.sqr_error_mean = float(np.sum((y - np.mean(y)) ** 2))
        if self.sqr_error_mean == 0.0:
            # Constant target: the ratio is undefined, so avoid dividing by 0.
            return 1.0 if self.sqr_error == 0.0 else 0.0
        return 1.0 - self.sqr_error / self.sqr_error_mean
def transformDf(df, array, train_size=0.2):
    """One-hot encode the listed columns and slice out train and test rows.

    Each column named in ``array`` is expanded with ``pd.get_dummies`` and the
    last column of the resulting frame is then dropped, so one level of every
    encoded column is represented by all-zero indicators.

    Args:
        df: source DataFrame; it is not modified.
        array: iterable of column names to encode, processed in order.
        train_size: fraction of the rows used for the training slice.

    Returns:
        ``(train_df, test_df)``: the first ``round(n * train_size)`` rows, and
        the rows from there up to ``round(n * train_size * 2)``. Rows beyond
        that point are not returned.
    """
    encoded = df
    for column in array:
        encoded = pd.get_dummies(encoded, columns=[column])
        # Drop the trailing column of the freshly encoded frame.
        encoded = encoded.drop(encoded.columns[-1], axis=1)

    n_rows = encoded.shape[0]
    train_end = round(n_rows * train_size)
    test_end = round(n_rows * train_size * 2)
    return encoded[:train_end], encoded[train_end:test_end]
# Load the data (the first CSV column is used as the index) and shuffle the
# rows, since transformDf takes its train/test slices by position.
df = pd.read_csv("diamonds.csv", index_col=0)
df = shuffle(df)

# Indicator layout after transformDf: one level per column is all zeros.
# CUT
# Fair = 1 0 0 0
# Good = 0 1 0 0
# Ideal = 0 0 1 0
# Premium = 0 0 0 1
# Very good = 0 0 0 0
# COLOR
# NOTE(review): this table lists no "H" level; if the data contains one there
# is an extra indicator column between G and I -- confirm against the CSV.
# D = 1 0 0 0 0
# E = 0 1 0 0 0
# F = 0 0 1 0 0
# G = 0 0 0 1 0
# I = 0 0 0 0 1
# J = 0 0 0 0 0
# CLARITY
# I1 = 1 0 0 0 0 0 0
# IF = 0 1 0 0 0 0 0
# SI1 = 0 0 1 0 0 0 0
# SI2 = 0 0 0 1 0 0 0
# VS1 = 0 0 0 0 1 0 0
# VS2 = 0 0 0 0 0 1 0
# VVS1 = 0 0 0 0 0 0 1
# VVS2 = 0 0 0 0 0 0 0

# Variables
TRAIN_SIZE = 0.2
dummies = np.array(["color", "cut", "clarity"])
df_train, df_test = transformDf(df, dummies, TRAIN_SIZE)

# Features Train
# Force a float matrix: a frame mixing numeric and boolean indicator columns
# would otherwise convert to an object array that the linear algebra rejects.
X_train = df_train.drop(columns=["price"]).to_numpy(dtype=float)
# Prepend a column of ones so the model learns an intercept.
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
# Label Train
y_train = df_train["price"].to_numpy()

# Features Test
X_test = df_test.drop(columns=["price"]).to_numpy(dtype=float)
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))
# Labels Test
y_test = df_test["price"].to_numpy()

model = LinearRegression()
model.fit(X_train, y_train)
predicts = model.predict(X_test)

# Preview at most the first 10 predictions; slicing keeps this safe when the
# test slice holds fewer than 10 rows.
for predicted, actual in zip(predicts[:10], y_test[:10]):
    print(f"Predict {predicted} / Real {actual}")
print(model.score(X_test, y_test))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement