import numpy as np
import pandas as pd
# Notes for the entire notebook:
# m refers to the number of training examples
# n refers to the number of features (not including the bias term)
# hAll refers to the hypothesized output for all training examples (m x 1 vector)
# yAll refers to the actual output for all training examples (m x 1 vector)
# X refers to the entire design matrix (m x (n + 1) matrix)
# x refers to one example (n x 1 vector) -> the bias term of 1 is added later
# ALL VECTORS ARE COLUMN VECTORS WHEN PASSED AS PARAMETERS AND DECLARED.

# New cell
# Reads the csv file and mean-normalizes every column (features and label).
def import_and_clean_data(text_file_name):
    data = pd.read_csv(text_file_name, sep=",", header=None)
    data.columns = ["sq. feet", "# bedrooms", "House sale price"]  # can add more features if necessary
    avgValues = data.mean()
    minValues = data.min()
    maxValues = data.max()
    data = (data - avgValues) / (maxValues - minValues)  # mean normalization of features
    return data, minValues, maxValues, avgValues
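
# A minimal sanity check of the normalization above (not in the original notebook),
# using a small in-memory DataFrame instead of a file; the column names are the only
# assumption carried over. After mean normalization, every non-constant column should
# have mean ~0 and a range of exactly 1.
demo = pd.DataFrame({"sq. feet": [1000.0, 1500.0, 2000.0],
                     "# bedrooms": [2.0, 3.0, 4.0],
                     "House sale price": [200000.0, 300000.0, 400000.0]})
demo_norm = (demo - demo.mean()) / (demo.max() - demo.min())
assert np.allclose(demo_norm.mean(), 0)                   # centered at zero
assert np.allclose(demo_norm.max() - demo_norm.min(), 1)  # range of exactly 1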
# Parameter dimensions:
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)
# X: m x (n + 1) (each row is a training example; each column holds one feature's values, except the first column, which is filled with ones to account for the bias term)
# yAll: m x 1 (each element is the actual output of the training example in the same row of X)

# Computes the regularized linear regression cost.
def compute_cost(theta, X, yAll, lmbd):
    m = len(X)
    hAll = np.matmul(X, theta)
    reg_term = lmbd * theta**2
    reg_term[0] = 0  # the bias parameter is not regularized
    return 1 / (2 * m) * (np.sum((hAll - yAll)**2) + np.sum(reg_term))
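
# Quick hand-checkable example (illustrative, not from the original notebook): with
# theta = [0, 1]^T, X = [[1, 1], [1, 2]], yAll = [2, 2]^T and lmbd = 2, the
# predictions are [1, 2]^T, so the squared errors sum to 1 and the penalty on the
# single non-bias parameter is 2, giving J = (1 + 2) / (2 * 2) = 0.75.
_theta = np.array([[0.0], [1.0]])
_X = np.array([[1.0, 1.0], [1.0, 2.0]])
_y = np.array([[2.0], [2.0]])
assert np.isclose(compute_cost(_theta, _X, _y, 2.0), 0.75)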
# Parameter dimensions:
# x: n x 1 (each element is the value of the corresponding feature; the bias feature of 1 is added inside the function)
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)

# Computes the hypothesized output given the input vector x and the parameter vector theta.
def h(x, theta):
    x = np.vstack((np.ones((1, 1)), x))  # prepend the bias term of 1
    return np.matmul(theta.T, x)         # 1 x 1 result: theta^T x
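
# Example usage (illustrative values only): with theta = [1, 2, 3]^T and
# x = [10, 20]^T, h returns 1 + 2*10 + 3*20 = 81.
assert np.isclose(h(np.array([[10.0], [20.0]]), np.array([[1.0], [2.0], [3.0]])), 81.0)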
# Parameter dimensions:
# data: m x (n + 1) DataFrame (n feature columns plus the label column)
# num_iters: scalar (number of gradient descent iterations)
# lr: scalar (learning rate)
# Local variable dimensions after manipulation:
# X: m x (n + 1) -> we drop the label column and add a bias column
# yAll: m x 1 -> we just take the label column
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)

# Optimizes the theta vector to parametrize the line that best fits the training data
# by performing regularized batch gradient descent.
def train_linear_regression(data, num_iters, lr, lmbd):
    X = np.array(data.drop(["House sale price"], axis=1))
    X = np.hstack((np.ones((len(X), 1)), X))
    yAll = np.reshape(np.array(data["House sale price"]), (-1, 1))
    theta = np.zeros((len(X[0]), 1))
    m = len(X)
    for i in range(num_iters):
        print(compute_cost(theta, X, yAll, lmbd))  # the cost should decrease every iteration
        hAll = np.matmul(X, theta)
        reg_term = lmbd / m * theta
        reg_term[0] = 0  # the bias parameter is not regularized
        theta = theta - lr * ((1 / m) * np.matmul((hAll - yAll).T, X).T + reg_term)
    return theta
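
# Cross-check (not part of the original notebook): the regularized normal equation
# solves the same minimization in closed form, theta = (X^T X + lmbd * L)^(-1) X^T y,
# where L is the identity matrix with its top-left entry zeroed so the bias term is
# not penalized (this matches setting the gradient of compute_cost to zero). Once
# gradient descent has converged, the two theta vectors should agree closely.
def normal_equation(data, lmbd):
    X = np.array(data.drop(["House sale price"], axis=1))
    X = np.hstack((np.ones((len(X), 1)), X))
    yAll = np.reshape(np.array(data["House sale price"]), (-1, 1))
    L = np.eye(X.shape[1])
    L[0, 0] = 0  # do not regularize the bias term
    return np.linalg.solve(np.matmul(X.T, X) + lmbd * L, np.matmul(X.T, yAll))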
# Local variable dimensions after manipulation:
# maxs: 1 x n (max for each feature, label removed)
# mins: 1 x n (min for each feature, label removed)
# avgs: 1 x n (average for each feature, label removed)

# Takes raw inputs for each feature and mean-normalizes them (using the scaling
# values calculated when the dataset was imported).
def formatInput(x, maxVals, minVals, avgVals):
    maxVals = maxVals.drop(["House sale price"], axis=0)
    minVals = minVals.drop(["House sale price"], axis=0)
    avgVals = avgVals.drop(["House sale price"], axis=0)
    maxs = np.array([maxVals])
    mins = np.array([minVals])
    avgs = np.array([avgVals])
    adjustedX = ((x.T - avgs) / (maxs - mins)).T
    return adjustedX
# When plugging a point into the regression, you simply unscale the hypothesis value,
# inverting the mean normalization that was applied to the label column.
def unscale_h(scaledH, maxVals, minVals, avgVals):
    maxs = np.array(maxVals["House sale price"])
    mins = np.array(minVals["House sale price"])
    avgs = np.array(avgVals["House sale price"])
    return (maxs - mins) * scaledH + avgs
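
# Round-trip sanity check (illustrative values): scaling a price with
# y_scaled = (y - avg) / (max - min) and then calling unscale_h should return
# the original value, since unscale_h inverts that mapping.
_mins = pd.Series({"House sale price": 100.0})
_maxs = pd.Series({"House sale price": 500.0})
_avgs = pd.Series({"House sale price": 300.0})
_scaled = (250.0 - 300.0) / (500.0 - 100.0)
assert np.isclose(unscale_h(_scaled, _maxs, _mins, _avgs), 250.0)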
# New cell
dataset, minVals, maxVals, avgVals = import_and_clean_data("houseData.txt")
theta = train_linear_regression(dataset, 500, 1, .09)

# New cell
print(theta)
testInput = np.array([1000, 2])  # a 1000 sq. ft. house with 2 bedrooms
testInput = np.reshape(testInput, (-1, 1))
testInput = formatInput(testInput, maxVals, minVals, avgVals)
print(unscale_h(h(testInput, theta), maxVals, minVals, avgVals))
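
# New cell
# Optional end-to-end check on synthetic data (not in the original notebook), so the
# pipeline can be exercised without houseData.txt. Prices follow an exact linear rule,
# so with lmbd = 0 the recovered prediction should land near the true value.
rng = np.random.default_rng(0)
sqft = rng.uniform(500, 3000, size=100)
beds = rng.integers(1, 6, size=100).astype(float)
price = 50 * sqft + 10000 * beds + 5000
synth = pd.DataFrame({"sq. feet": sqft, "# bedrooms": beds, "House sale price": price})
avgs_s, mins_s, maxs_s = synth.mean(), synth.min(), synth.max()
synth_norm = (synth - avgs_s) / (maxs_s - mins_s)
theta_s = train_linear_regression(synth_norm, 500, 1, 0)
x_s = formatInput(np.array([[1500.0], [3.0]]), maxs_s, mins_s, avgs_s)
print(unscale_h(h(x_s, theta_s), maxs_s, mins_s, avgs_s))  # true value: 50*1500 + 10000*3 + 5000 = 110000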