Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Machine Learning \n",
- "Robert Knox**"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Assignment details:\n",
- "\n",
- "Use linear regression, solved with the normal equation, to predict water temperature (T_degC)\n",
- "\n",
- "1) Only use 'Salnty' and 'STheta' as predictors\n",
- "\n",
- "2) Remove NaN / NA values from dataset (prior to building train/test sets). \n",
- "\n",
- "3) Solve for rmse, variance explained, and r-squared."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Step 1: Import"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "from numpy import linalg as LA\n",
- "import pandas as pd\n",
- "import os\n",
- "from sklearn.linear_model import LinearRegression\n",
- "import sklearn.metrics as metrics\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Step 2: Read in Data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Read the header row to list every column name in the file.\n",
- "# The line terminator is stripped so the last name is not stored with a trailing newline.\n",
- "with open('bottle.csv') as f:\n",
- "    line = f.readline()\n",
- "line_column_names = line.rstrip('\\r\\n').split(',')\n",
- "\n",
- "# Columns #47 (MeanAq) and #73 (pH1) have mixed types, so only the target\n",
- "# and the two predictors are loaded.\n",
- "desired_cols = ['T_degC','Salnty','STheta']\n",
- "df = pd.read_csv('bottle.csv', usecols=desired_cols)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>T_degC</th>\n",
- " <th>Salnty</th>\n",
- " <th>STheta</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>10.50</td>\n",
- " <td>33.440</td>\n",
- " <td>25.649</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>10.46</td>\n",
- " <td>33.440</td>\n",
- " <td>25.656</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>10.46</td>\n",
- " <td>33.437</td>\n",
- " <td>25.654</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>10.45</td>\n",
- " <td>33.420</td>\n",
- " <td>25.643</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>10.45</td>\n",
- " <td>33.421</td>\n",
- " <td>25.643</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " T_degC Salnty STheta\n",
- "0 10.50 33.440 25.649\n",
- "1 10.46 33.440 25.656\n",
- "2 10.46 33.437 25.654\n",
- "3 10.45 33.420 25.643\n",
- "4 10.45 33.421 25.643"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Preview the first rows to confirm the three requested columns loaded\n",
- "df.head(n=5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Step 3: Handle Nulls & NAN"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Drop every row that has a missing value in any of the loaded columns\n",
- "df.dropna(axis='index', how='any', inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "T_degC False\n",
- "Salnty False\n",
- "STheta False\n",
- "dtype: bool"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Sanity check: every column should report False (no missing values remain)\n",
- "df.isnull().any(axis=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Constant column of ones; it becomes the intercept column of the design matrix\n",
- "df[\"Intercept\"] = 1.0"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Step 4: Convert to arrays & solve using the Normal Equation\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "theta_best: [89.7647 -0.0555 -2.9838]\n",
- "\n",
- "RMSE:\t\t\t 2.3595\n",
- "Variance Explained:\t 0.6875\n",
- "R-Squared:\t\t 0.6875\n"
- ]
- }
- ],
- "source": [
- "# Pull the target and each design-matrix column out of the frame as NumPy arrays\n",
- "T_degC = df['T_degC'].values\n",
- "Intercept = df['Intercept'].values\n",
- "Salnty = df['Salnty'].values\n",
- "STheta = df['STheta'].values\n",
- "\n",
- "# Design matrix X with columns [ones, Salnty, STheta]\n",
- "X = np.column_stack((Intercept,Salnty,STheta))\n",
- "\n",
- "# Transpose of the design matrix, used on both sides of the normal equation\n",
- "Xtran = np.transpose(X)\n",
- "\n",
- "# The observed temperatures are the regression target\n",
- "y_true = T_degC.copy()\n",
- "\n",
- "# Normal equation: (X^T X) theta = X^T y.\n",
- "# Solving the 3x3 linear system directly avoids forming an explicit inverse\n",
- "# (less numerically stable) and avoids the 3 x N intermediate inv(X^T X).dot(X^T).\n",
- "theta_best = LA.solve(Xtran.dot(X), Xtran.dot(y_true))\n",
- "yhat = X.dot(theta_best)\n",
- "\n",
- "# Make the output look nice with array_str\n",
- "print(\"theta_best: \",np.array_str(theta_best,precision=4,suppress_small=True))\n",
- "\n",
- "# Residuals and target mean, shared by the metrics below.\n",
- "# Vectorised reductions are used instead of the builtin sum(), which iterates in Python.\n",
- "residuals = y_true - yhat\n",
- "ybar = y_true.mean()\n",
- "\n",
- "# Sums of squares: SSE is the unexplained part, SST the total about the mean\n",
- "SSE = (residuals**2).sum()\n",
- "SST = ((y_true - ybar)**2).sum()\n",
- "\n",
- "# Variance explained: 1 - Var(residuals)/Var(y), using sample variances (ddof=1)\n",
- "var_explained = 1 - np.var(residuals, ddof=1)/np.var(y_true, ddof=1)\n",
- "\n",
- "# Root mean square error\n",
- "rmse = (SSE/len(y_true))**0.5\n",
- "\n",
- "# R-squared (not adjusted)\n",
- "r_squared = 1 - SSE/SST\n",
- "\n",
- "print(\"\\nRMSE:\\t\\t\\t\",np.round(rmse,4))\n",
- "print(\"Variance Explained:\\t\",np.round(var_explained,4))\n",
- "print(\"R-Squared:\\t\\t\",np.round(r_squared,4))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Check using scikit-learn's linear model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Intercept: 89.76473480839081 \n",
- "Thetas: [ 0. -0.05547897 -2.98377603]\n",
- "R-Squared: 0.6875216833872659\n",
- "RMSE 2.3595303631129534\n"
- ]
- }
- ],
- "source": [
- "# Cross-check the closed-form fit against scikit-learn on the same data.\n",
- "# X already carries a column of ones, so the estimator's own intercept absorbs\n",
- "# the constant and the first coefficient is reported as 0.\n",
- "lin_reg = LinearRegression()\n",
- "reg = lin_reg.fit(X, y_true)\n",
- "print(\"Intercept: \",reg.intercept_,\"\\nThetas: \", reg.coef_)\n",
- "\n",
- "# Coefficient of determination on the training data\n",
- "r2_sk = reg.score(X,y_true)\n",
- "print(\"R-Squared:\",r2_sk)\n",
- "\n",
- "# RMSE is the square root of scikit-learn's mean squared error\n",
- "yhat_sk = reg.predict(X)\n",
- "mse_sk = metrics.mean_squared_error(y_true,yhat_sk)\n",
- "print(\"RMSE\",mse_sk**0.5)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.4"
- },
- "toc": {
- "base_numbering": 1,
- "nav_menu": {},
- "number_sections": false,
- "sideBar": true,
- "skip_h1_title": false,
- "title_cell": "Table of Contents",
- "title_sidebar": "Contents",
- "toc_cell": false,
- "toc_position": {
- "height": "calc(100% - 180px)",
- "left": "10px",
- "top": "150px",
- "width": "165px"
- },
- "toc_section_display": true,
- "toc_window_display": true
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment