import numpy as np
import pandas as pd
# Notes for the entire notebook:
# m refers to the number of training examples
# n refers to the number of features (not including the bias term)
# hAll refers to the hypothesized output for all training examples (m x 1 vector)
# yAll refers to the actual output for all training examples (m x 1 vector)
# X refers to the entire design matrix (m x (n + 1) matrix)
# x refers to one example (n x 1 vector) -> the bias term of 1 is added later
# ALL VECTORS ARE COLUMN VECTORS WHEN PASSED AS PARAMETERS AND DECLARED.

# New cell
# Reads the csv file and mean-normalizes every column (features and label).
def import_and_clean_data(text_file_name):
    data = pd.read_csv(text_file_name, sep=",", header=None)
    data.columns = ["sq. feet", "# bedrooms", "House sale price"]  # can add more features if necessary
    avgValues = data.mean()
    minValues = data.min()
    maxValues = data.max()
    data = (data - avgValues) / (maxValues - minValues)  # mean normalization of features
    return data, minValues, maxValues, avgValues
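
# A minimal sanity check of the normalization above (not in the original notebook),
# using a small in-memory DataFrame instead of a file; the column names are the only
# assumption carried over. After mean normalization, every non-constant column should
# have mean ~0 and a range of exactly 1.
demo = pd.DataFrame({"sq. feet": [1000.0, 1500.0, 2000.0],
                     "# bedrooms": [2.0, 3.0, 4.0],
                     "House sale price": [200000.0, 300000.0, 400000.0]})
demo_norm = (demo - demo.mean()) / (demo.max() - demo.min())
assert np.allclose(demo_norm.mean(), 0)                   # centered at zero
assert np.allclose(demo_norm.max() - demo_norm.min(), 1)  # range of exactly 1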
# Parameter dimensions:
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)
# X: m x (n + 1) (each row is a training example; each column holds one feature's values, except the first column, which is filled with ones to account for the bias term)
# yAll: m x 1 (each element is the actual output of the training example in the same row of X)

# Computes the regularized linear regression cost.
def compute_cost(theta, X, yAll, lmbd):
    m = len(X)
    hAll = np.matmul(X, theta)
    reg_term = lmbd * theta**2
    reg_term[0] = 0  # the bias parameter is not regularized
    return 1 / (2 * m) * (np.sum((hAll - yAll)**2) + np.sum(reg_term))
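
# Quick hand-checkable example (illustrative, not from the original notebook): with
# theta = [0, 1]^T, X = [[1, 1], [1, 2]], yAll = [2, 2]^T and lmbd = 2, the
# predictions are [1, 2]^T, so the squared errors sum to 1 and the penalty on the
# single non-bias parameter is 2, giving J = (1 + 2) / (2 * 2) = 0.75.
_theta = np.array([[0.0], [1.0]])
_X = np.array([[1.0, 1.0], [1.0, 2.0]])
_y = np.array([[2.0], [2.0]])
assert np.isclose(compute_cost(_theta, _X, _y, 2.0), 0.75)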
# Parameter dimensions:
# x: n x 1 (each element is the value of the corresponding feature; the bias feature of 1 is added inside the function)
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)

# Computes the hypothesized output given the input vector x and the parameter vector theta.
def h(x, theta):
    x = np.vstack((np.ones((1, 1)), x))  # prepend the bias term of 1
    return np.matmul(theta.T, x)         # 1 x 1 result: theta^T x
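
# Example usage (illustrative values only): with theta = [1, 2, 3]^T and
# x = [10, 20]^T, h returns 1 + 2*10 + 3*20 = 81.
assert np.isclose(h(np.array([[10.0], [20.0]]), np.array([[1.0], [2.0], [3.0]])), 81.0)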
# Parameter dimensions:
# data: m x (n + 1) DataFrame (n feature columns plus the label column)
# num_iters: scalar (number of gradient descent iterations)
# lr: scalar (learning rate)
# Local variable dimensions after manipulation:
# X: m x (n + 1) -> we drop the label column and add a bias column
# yAll: m x 1 -> we just take the label column
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)

# Optimizes the theta vector to parametrize the line that best fits the training data
# by performing regularized batch gradient descent.
def train_linear_regression(data, num_iters, lr, lmbd):
    X = np.array(data.drop(["House sale price"], axis=1))
    X = np.hstack((np.ones((len(X), 1)), X))
    yAll = np.reshape(np.array(data["House sale price"]), (-1, 1))
    theta = np.zeros((len(X[0]), 1))
    m = len(X)
    for i in range(num_iters):
        print(compute_cost(theta, X, yAll, lmbd))  # the cost should decrease every iteration
        hAll = np.matmul(X, theta)
        reg_term = lmbd / m * theta
        reg_term[0] = 0  # the bias parameter is not regularized
        theta = theta - lr * ((1 / m) * np.matmul((hAll - yAll).T, X).T + reg_term)
    return theta
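
# Cross-check (not part of the original notebook): the regularized normal equation
# solves the same minimization in closed form, theta = (X^T X + lmbd * L)^(-1) X^T y,
# where L is the identity matrix with its top-left entry zeroed so the bias term is
# not penalized (this matches setting the gradient of compute_cost to zero). Once
# gradient descent has converged, the two theta vectors should agree closely.
def normal_equation(data, lmbd):
    X = np.array(data.drop(["House sale price"], axis=1))
    X = np.hstack((np.ones((len(X), 1)), X))
    yAll = np.reshape(np.array(data["House sale price"]), (-1, 1))
    L = np.eye(X.shape[1])
    L[0, 0] = 0  # do not regularize the bias term
    return np.linalg.solve(np.matmul(X.T, X) + lmbd * L, np.matmul(X.T, yAll))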
# Local variable dimensions after manipulation:
# maxs: 1 x n (max for each feature, label removed)
# mins: 1 x n (min for each feature, label removed)
# avgs: 1 x n (average for each feature, label removed)

# Takes raw inputs for each feature and mean-normalizes them (using the scaling
# values calculated when the dataset was imported).
def formatInput(x, maxVals, minVals, avgVals):
    maxVals = maxVals.drop(["House sale price"], axis=0)
    minVals = minVals.drop(["House sale price"], axis=0)
    avgVals = avgVals.drop(["House sale price"], axis=0)
    maxs = np.array([maxVals])
    mins = np.array([minVals])
    avgs = np.array([avgVals])
    adjustedX = ((x.T - avgs) / (maxs - mins)).T
    return adjustedX
# When plugging a point into the regression, you simply unscale the hypothesis value,
# inverting the mean normalization that was applied to the label column.
def unscale_h(scaledH, maxVals, minVals, avgVals):
    maxs = np.array(maxVals["House sale price"])
    mins = np.array(minVals["House sale price"])
    avgs = np.array(avgVals["House sale price"])
    return (maxs - mins) * scaledH + avgs
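
# Round-trip sanity check (illustrative values): scaling a price with
# y_scaled = (y - avg) / (max - min) and then calling unscale_h should return
# the original value, since unscale_h inverts that mapping.
_mins = pd.Series({"House sale price": 100.0})
_maxs = pd.Series({"House sale price": 500.0})
_avgs = pd.Series({"House sale price": 300.0})
_scaled = (250.0 - 300.0) / (500.0 - 100.0)
assert np.isclose(unscale_h(_scaled, _maxs, _mins, _avgs), 250.0)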
# New cell
dataset, minVals, maxVals, avgVals = import_and_clean_data("houseData.txt")
theta = train_linear_regression(dataset, 500, 1, .09)

# New cell
print(theta)
testInput = np.array([1000, 2])  # a 1000 sq. ft. house with 2 bedrooms
testInput = np.reshape(testInput, (-1, 1))
testInput = formatInput(testInput, maxVals, minVals, avgVals)
print(unscale_h(h(testInput, theta), maxVals, minVals, avgVals))
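
# New cell
# Optional end-to-end check on synthetic data (not in the original notebook), so the
# pipeline can be exercised without houseData.txt. Prices follow an exact linear rule,
# so with lmbd = 0 the recovered prediction should land near the true value.
rng = np.random.default_rng(0)
sqft = rng.uniform(500, 3000, size=100)
beds = rng.integers(1, 6, size=100).astype(float)
price = 50 * sqft + 10000 * beds + 5000
synth = pd.DataFrame({"sq. feet": sqft, "# bedrooms": beds, "House sale price": price})
avgs_s, mins_s, maxs_s = synth.mean(), synth.min(), synth.max()
synth_norm = (synth - avgs_s) / (maxs_s - mins_s)
theta_s = train_linear_regression(synth_norm, 500, 1, 0)
x_s = formatInput(np.array([[1500.0], [3.0]]), maxs_s, mins_s, avgs_s)
print(unscale_h(h(x_s, theta_s), maxs_s, mins_s, avgs_s))  # true value: 50*1500 + 10000*3 + 5000 = 110000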