import numpy as np
import pandas as pd

# notes for the entire notebook:
# m refers to the number of training examples
# n refers to the number of features (not including the bias term)
# hAll refers to the hypothesized output for all training examples (m-dimensional vector)
# yAll refers to the actual output for all training examples (m-dimensional vector)
# X refers to the entire design matrix (m x (n + 1) dimensional matrix)
# x refers to one example (n-dimensional vector) -> the bias term of 1 is added later

# ALL VECTORS ARE COLUMN VECTORS WHEN PASSED AS PARAMETERS AND DECLARED.

# New Cell
# reads the csv file and mean-normalizes every column (features and label)
def import_and_clean_data(text_file_name):
    data = pd.read_csv(text_file_name, sep=",", header=None)
    data.columns = ["sq. feet", "# bedrooms", "House sale price"]  # can add more features if necessary
    avgValues = data.mean()
    minValues = data.min()
    maxValues = data.max()
    data = (data - avgValues) / (maxValues - minValues)  # mean normalization of features

    return data, minValues, maxValues, avgValues

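# Illustrative sketch of the normalization above: a made-up three-row dataset with the
# same column names assumed by import_and_clean_data. Every value and name here is
# arbitrary and exists only to show the (x - mean) / (max - min) scaling.
_toy = pd.DataFrame({"sq. feet": [1000.0, 1500.0, 2000.0],
                     "# bedrooms": [2.0, 3.0, 4.0],
                     "House sale price": [200000.0, 300000.0, 400000.0]})
_toy_scaled = (_toy - _toy.mean()) / (_toy.max() - _toy.min())
# each column now has mean 0 and is scaled by its range, e.g. "sq. feet" becomes [-0.5, 0.0, 0.5]
print(_toy_scaled)
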
# Parameter dimensions:
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)
# X: m x (n + 1) (each row is a training example, each column holds the values of one feature, except for the first column, which is filled with ones to account for the bias term)
# yAll: m x 1 (each element is the actual output of the training example in the same row of X)
# computes the regularized linear regression cost
def compute_cost(theta, X, yAll, lmbd):
    m = len(X)
    hAll = np.matmul(X, theta)

    reg_term = lmbd * theta**2
    reg_term[0] = 0  # the bias parameter is not regularized

    return 1 / (2 * m) * (np.sum((hAll - yAll)**2) + np.sum(reg_term))

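# Quick sanity check of compute_cost on hand-built values (the arrays below are made up
# for this check only): with theta = 0 and lmbd = 0 the cost reduces to the sum of squared
# labels over 2m, i.e. (1^2 + 3^2) / (2 * 2) = 2.5.
_X_chk = np.array([[1.0, 0.5], [1.0, 1.5]])   # bias column plus one feature, m = 2
_y_chk = np.array([[1.0], [3.0]])
_theta_chk = np.zeros((2, 1))
print(compute_cost(_theta_chk, _X_chk, _y_chk, 0.0))   # expect 2.5
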
# Parameter dimensions:
# x: n x 1 (each element is the value of the corresponding feature -- the bias feature of 1 is added inside the function)
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)
# computes the hypothesized output given the input vector x (n-dimensional column vector) and the parameter vector
# theta.
def h(x, theta):
    x = np.vstack((np.ones((1, 1)), x))
    return np.matmul(theta.T, x)

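# Usage sketch for h with arbitrary values (both vectors are made up for illustration):
# with theta = [1, 2, 3]^T and x = [10, 20]^T the hypothesis is 1 + 2*10 + 3*20 = 81,
# returned as a 1 x 1 array.
_theta_demo = np.array([[1.0], [2.0], [3.0]])
_x_demo = np.array([[10.0], [20.0]])
print(h(_x_demo, _theta_demo))   # expect [[81.]]
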
# Parameter dimensions:
# data: nil -> ends up being m x n and then m x (n + 1) when the bias column is added
# num_iters: nil
# lr: nil
# Local variable dimensions after manipulation:
# X: m x (n + 1) -> we drop the label column and add a bias column
# yAll: m x 1 -> we just take the label column
# theta: (n + 1) x 1 (first element is the bias parameter, followed by the parameters for each feature)
# optimizes the theta vector to parametrize a line that best fits the training data by performing gradient descent
def train_linear_regression(data, num_iters, lr, lmbd):
    X = np.array(data.drop(["House sale price"], axis=1))
    X = np.hstack((np.ones((len(X), 1)), X))
    yAll = np.reshape(np.array(data["House sale price"]), (-1, 1))
    theta = np.zeros((len(X[0]), 1))

    m = len(X)

    for i in range(num_iters):
        print(compute_cost(theta, X, yAll, lmbd))
        hAll = np.matmul(X, theta)

        reg_term = lmbd / m * theta
        reg_term[0] = 0  # the bias parameter is not regularized

        theta = theta - lr * ((1 / m) * np.matmul((hAll - yAll).T, X).T + reg_term)
    return theta


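# Reference for the update performed in each iteration above:
#   theta := theta - lr * ( (1/m) * X^T (X theta - yAll) + (lmbd/m) * theta_reg )
# where theta_reg is theta with the bias entry zeroed. The smoke test below uses a tiny
# made-up DataFrame (names and values are arbitrary) just to confirm that the cost printed
# inside the loop decreases over a few iterations.
_data_demo = pd.DataFrame({"sq. feet": [-0.5, 0.0, 0.5],
                           "# bedrooms": [-0.5, 0.0, 0.5],
                           "House sale price": [-0.5, 0.0, 0.5]})
_theta_demo2 = train_linear_regression(_data_demo, 10, 0.1, 0.0)
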
# Local variable dimensions after manipulation:
# maxs: 1 x n (maxes for each feature, label removed)
# mins: 1 x n (mins for each feature, label removed)
# avgs: 1 x n (avgs for each feature, label removed)
# takes raw (unscaled) inputs for each feature and mean-normalizes them (using the scaling values computed when the dataset was imported)
def formatInput(x, maxVals, minVals, avgVals):
    maxVals = maxVals.drop(["House sale price"], axis=0)
    minVals = minVals.drop(["House sale price"], axis=0)
    avgVals = avgVals.drop(["House sale price"], axis=0)

    maxs = np.array([maxVals])
    mins = np.array([minVals])
    avgs = np.array([avgVals])

    adjustedX = ((x.T - avgs) / (maxs - mins)).T
    return adjustedX

# when plugging in a point to test the regression, you simply unscale the hypothesized value!
def unscale_h(scaledH, maxVals, minVals, avgVals):
    maxs = np.array(maxVals["House sale price"])
    mins = np.array(minVals["House sale price"])
    avgs = np.array(avgVals["House sale price"])
    return (maxs - mins) * scaledH + avgs

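# Round-trip sketch for unscale_h with made-up scaling values (illustrative only):
# unscaling inverts the label's mean normalization, so a scaled hypothesis of 0 should
# map back to the label's average.
_maxV = pd.Series({"House sale price": 400000.0})
_minV = pd.Series({"House sale price": 200000.0})
_avgV = pd.Series({"House sale price": 300000.0})
print(unscale_h(0.0, _maxV, _minV, _avgV))   # expect 300000.0
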
# New cell
dataset, minVals, maxVals, avgVals = import_and_clean_data("houseData.txt")
theta = train_linear_regression(dataset, 500, 1, .09)

# New cell
print(theta)
testInput = np.array([1000, 2])
testInput = np.reshape(testInput, (-1, 1))
testInput = formatInput(testInput, maxVals, minVals, avgVals)
print(unscale_h(h(testInput, theta), maxVals, minVals, avgVals))