Guest User

Untitled

a guest
Dec 18th, 2018
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.14 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "**Machine Learning \n",
  8. "Robert Knox**"
  9. ]
  10. },
  11. {
  12. "cell_type": "markdown",
  13. "metadata": {},
  14. "source": [
  15. "Assignment details:\n",
  16. "\n",
  17. "Use the linear regression normal equation to predict water temperature T_degC\n",
  18. "\n",
  19. "1) Only use 'Salnty', 'STheta' for predictors\n",
  20. "\n",
  21. "2) Remove NaN / NA values from dataset (prior to building train/test sets). \n",
  22. "\n",
  23. "3) Solve for RMSE, variance explained, and R-squared."
  24. ]
  25. },
  26. {
  27. "cell_type": "markdown",
  28. "metadata": {},
  29. "source": [
  30. "## Step 1: Import"
  31. ]
  32. },
  33. {
  34. "cell_type": "code",
  35. "execution_count": 1,
  36. "metadata": {},
  37. "outputs": [],
  38. "source": [
  39. "import numpy as np\n",
  40. "from numpy import linalg as LA\n",
  41. "import pandas as pd\n",
  42. "import os\n",
  43. "from sklearn.linear_model import LinearRegression\n",
  44. "import sklearn.metrics as metrics\n"
  45. ]
  46. },
  47. {
  48. "cell_type": "markdown",
  49. "metadata": {},
  50. "source": [
  51. "## Step 2: Read in Data"
  52. ]
  53. },
  54. {
  55. "cell_type": "code",
  56. "execution_count": 2,
  57. "metadata": {},
  58. "outputs": [],
  59. "source": [
  60. "with open('bottle.csv') as f:\n",
  61. " line = f.readline()\n",
  62. "line_column_names = line.split(',')\n",
  63. "\n",
  64. "#columns 47 (MeanAq) & 73 (pH1) have mixed types, so only the desired columns are read\n",
  65. "\n",
  66. "desired_cols = ['T_degC','Salnty','STheta']\n",
  67. "df = pd.read_csv('bottle.csv',sep=',',header='infer',index_col=None,usecols=desired_cols)"
  68. ]
  69. },
  70. {
  71. "cell_type": "code",
  72. "execution_count": 3,
  73. "metadata": {},
  74. "outputs": [
  75. {
  76. "data": {
  77. "text/html": [
  78. "<div>\n",
  79. "<style scoped>\n",
  80. " .dataframe tbody tr th:only-of-type {\n",
  81. " vertical-align: middle;\n",
  82. " }\n",
  83. "\n",
  84. " .dataframe tbody tr th {\n",
  85. " vertical-align: top;\n",
  86. " }\n",
  87. "\n",
  88. " .dataframe thead th {\n",
  89. " text-align: right;\n",
  90. " }\n",
  91. "</style>\n",
  92. "<table border=\"1\" class=\"dataframe\">\n",
  93. " <thead>\n",
  94. " <tr style=\"text-align: right;\">\n",
  95. " <th></th>\n",
  96. " <th>T_degC</th>\n",
  97. " <th>Salnty</th>\n",
  98. " <th>STheta</th>\n",
  99. " </tr>\n",
  100. " </thead>\n",
  101. " <tbody>\n",
  102. " <tr>\n",
  103. " <th>0</th>\n",
  104. " <td>10.50</td>\n",
  105. " <td>33.440</td>\n",
  106. " <td>25.649</td>\n",
  107. " </tr>\n",
  108. " <tr>\n",
  109. " <th>1</th>\n",
  110. " <td>10.46</td>\n",
  111. " <td>33.440</td>\n",
  112. " <td>25.656</td>\n",
  113. " </tr>\n",
  114. " <tr>\n",
  115. " <th>2</th>\n",
  116. " <td>10.46</td>\n",
  117. " <td>33.437</td>\n",
  118. " <td>25.654</td>\n",
  119. " </tr>\n",
  120. " <tr>\n",
  121. " <th>3</th>\n",
  122. " <td>10.45</td>\n",
  123. " <td>33.420</td>\n",
  124. " <td>25.643</td>\n",
  125. " </tr>\n",
  126. " <tr>\n",
  127. " <th>4</th>\n",
  128. " <td>10.45</td>\n",
  129. " <td>33.421</td>\n",
  130. " <td>25.643</td>\n",
  131. " </tr>\n",
  132. " </tbody>\n",
  133. "</table>\n",
  134. "</div>"
  135. ],
  136. "text/plain": [
  137. " T_degC Salnty STheta\n",
  138. "0 10.50 33.440 25.649\n",
  139. "1 10.46 33.440 25.656\n",
  140. "2 10.46 33.437 25.654\n",
  141. "3 10.45 33.420 25.643\n",
  142. "4 10.45 33.421 25.643"
  143. ]
  144. },
  145. "execution_count": 3,
  146. "metadata": {},
  147. "output_type": "execute_result"
  148. }
  149. ],
  150. "source": [
  151. "df.head()"
  152. ]
  153. },
  154. {
  155. "cell_type": "markdown",
  156. "metadata": {},
  157. "source": [
  158. "## Step 3: Handle Nulls & NaN"
  159. ]
  160. },
  161. {
  162. "cell_type": "code",
  163. "execution_count": 4,
  164. "metadata": {},
  165. "outputs": [],
  166. "source": [
  167. "df.dropna(axis=0,inplace=True)"
  168. ]
  169. },
  170. {
  171. "cell_type": "code",
  172. "execution_count": 5,
  173. "metadata": {},
  174. "outputs": [
  175. {
  176. "data": {
  177. "text/plain": [
  178. "T_degC False\n",
  179. "Salnty False\n",
  180. "STheta False\n",
  181. "dtype: bool"
  182. ]
  183. },
  184. "execution_count": 5,
  185. "metadata": {},
  186. "output_type": "execute_result"
  187. }
  188. ],
  189. "source": [
  190. "#verify that no NaN values remain in any column\n",
  191. "df.isna().any()"
  192. ]
  193. },
  194. {
  195. "cell_type": "code",
  196. "execution_count": 6,
  197. "metadata": {},
  198. "outputs": [],
  199. "source": [
  200. "df[\"Intercept\"] = np.ones(len(df))"
  201. ]
  202. },
  203. {
  204. "cell_type": "markdown",
  205. "metadata": {},
  206. "source": [
  207. "## Step 4: Convert to arrays & solve using the Normal Equation\n"
  208. ]
  209. },
  210. {
  211. "cell_type": "code",
  212. "execution_count": 7,
  213. "metadata": {},
  214. "outputs": [
  215. {
  216. "name": "stdout",
  217. "output_type": "stream",
  218. "text": [
  219. "theta_best: [89.7647 -0.0555 -2.9838]\n",
  220. "\n",
  221. "RMSE:\t\t\t 2.3595\n",
  222. "Variance Explained:\t 0.6875\n",
  223. "R-Squared:\t\t 0.6875\n"
  224. ]
  225. }
  226. ],
  227. "source": [
  228. "T_degC = df['T_degC'].values\n",
  229. "Intercept = df['Intercept'].values\n",
  230. "Salnty = df['Salnty'].values\n",
  231. "STheta = df['STheta'].values\n",
  232. "\n",
  233. "#build our data matrix X (columns: Intercept, Salnty, STheta)\n",
  234. "X = np.column_stack((Intercept,Salnty,STheta))\n",
  235. "\n",
  236. "#create the transpose of the data matrix for ease of calculation\n",
  237. "Xtran = np.transpose(X)\n",
  238. "\n",
  239. "#define our y_true as T_degC\n",
  240. "y_true = T_degC.copy()\n",
  241. "#print(\"y_true: \",y_true)\n",
  242. "\n",
  243. "theta_best = LA.inv((Xtran.dot(X))).dot(Xtran).dot(y_true)\n",
  244. "yhat = X.dot(theta_best)\n",
  245. "\n",
  246. "#Make the output look nice with array_str\n",
  247. "print(\"theta_best: \",np.array_str(theta_best,precision=4,suppress_small=True))\n",
  248. "\n",
  249. "ybar = sum(y_true)/len(y_true)\n",
  250. "#print(\"ybar\",ybar)\n",
  251. "\n",
  252. "#alternative calculations\n",
  253. "#SST = sum((y_true-ybar)**2)\n",
  254. "#SSM = sum((yhat-ybar)**2)\n",
  255. "#SSE = sum((y_true-yhat)**2)\n",
  256. "\n",
  257. "#Calculate Variance Explained\n",
  258. "var_explained = 1-np.cov(y_true-yhat)/np.cov(y_true)\n",
  259. "\n",
  260. "#Calculate root mean square error\n",
  261. "rmse = (sum((yhat-y_true)**2)/len(y_true))**0.5\n",
  262. "\n",
  263. "#Calculate R-Squared (not adjusted)\n",
  264. "r_squared = 1-((y_true-yhat)**2).sum()/(len(y_true)*(y_true.std()**2))\n",
  265. "\n",
  266. "print(\"\\nRMSE:\\t\\t\\t\",np.round(rmse,4))\n",
  267. "print(\"Variance Explained:\\t\",np.round(var_explained,4))\n",
  268. "print(\"R-Squared:\\t\\t\",np.round(r_squared,4))"
  269. ]
  270. },
  271. {
  272. "cell_type": "markdown",
  273. "metadata": {},
  274. "source": [
  275. "## Check using the scikit-learn linear model"
  276. ]
  277. },
  278. {
  279. "cell_type": "code",
  280. "execution_count": 8,
  281. "metadata": {},
  282. "outputs": [
  283. {
  284. "name": "stdout",
  285. "output_type": "stream",
  286. "text": [
  287. "Intercept: 89.76473480839081 \n",
  288. "Thetas: [ 0. -0.05547897 -2.98377603]\n",
  289. "R-Squared: 0.6875216833872659\n",
  290. "RMSE 2.3595303631129534\n"
  291. ]
  292. }
  293. ],
  294. "source": [
  295. "lin_reg = LinearRegression()\n",
  296. "reg = lin_reg.fit(X, y_true)\n",
  297. "print(\"Intercept: \",reg.intercept_,\"\\nThetas: \", reg.coef_)\n",
  298. "print(\"R-Squared:\",reg.score(X,y_true))\n",
  299. "yhat_sk = reg.predict(X)\n",
  300. "print(\"RMSE\",metrics.mean_squared_error(y_true,yhat_sk)**0.5)"
  301. ]
  302. }
  303. ],
  304. "metadata": {
  305. "kernelspec": {
  306. "display_name": "Python 3",
  307. "language": "python",
  308. "name": "python3"
  309. },
  310. "language_info": {
  311. "codemirror_mode": {
  312. "name": "ipython",
  313. "version": 3
  314. },
  315. "file_extension": ".py",
  316. "mimetype": "text/x-python",
  317. "name": "python",
  318. "nbconvert_exporter": "python",
  319. "pygments_lexer": "ipython3",
  320. "version": "3.6.4"
  321. },
  322. "toc": {
  323. "base_numbering": 1,
  324. "nav_menu": {},
  325. "number_sections": false,
  326. "sideBar": true,
  327. "skip_h1_title": false,
  328. "title_cell": "Table of Contents",
  329. "title_sidebar": "Contents",
  330. "toc_cell": false,
  331. "toc_position": {
  332. "height": "calc(100% - 180px)",
  333. "left": "10px",
  334. "top": "150px",
  335. "width": "165px"
  336. },
  337. "toc_section_display": true,
  338. "toc_window_display": true
  339. }
  340. },
  341. "nbformat": 4,
  342. "nbformat_minor": 2
  343. }
Add Comment
Please, Sign In to add comment