Advertisement
Guest User

Untitled

a guest
May 20th, 2019
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.33 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "df <- read.csv(\"logistic.csv\")\n",
  10. "#head(df, 15)\n"
  11. ]
  12. },
  13. {
  14. "cell_type": "code",
  15. "execution_count": 2,
  16. "metadata": {},
  17. "outputs": [],
  18. "source": [
  19. "## Splitting the dataset\n",
  20. "\n",
  21. "n_observations <- nrow(df)\n",
  22. "perm_rows <- sample(n_observations)\n",
  23. "df_shuffled <- df[perm_rows, ]\n",
  24. "split <- round(n_observations* 0.75)\n",
  25. "train <- df_shuffled[1:split, ]\n",
  26. "test <- df_shuffled[(split +1): n_observations, ]\n",
  27. "\n",
  28. "X_train <- as.matrix(train[, c(1:2)])\n",
  29. "Y_train <- as.matrix(train[, c(3)])\n",
  30. "X_test <- as.matrix(test[ ,c(1:2)]) \n",
  31. "Y_test <- as.matrix(test[,c(3)])"
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": 3,
  37. "metadata": {},
  38. "outputs": [],
  39. "source": [
  40. "###Sigmoid function\n",
  41. "\n",
  42. "sigmoid <- function(x) {sig <- 1/(1+exp(-x)) \n",
  43. " return(sig)}\n",
  44. " \n",
  45. "\n",
  46. "## Logistic Loss Function: \n",
  47. "cost_function<- function(t){\n",
  48. " ## Taking in the x values with num rows\n",
  49. " instances <- nrow(X)\n",
  50. " ## Defining the output of the sigmoid function by X matrix matmul theta \n",
  51. " sig <- sigmoid(X%*%t)\n",
  52. " \n",
  53. " ## Approximating the cost function -1/m[E y(i)logh0(x(i)) + (1- y(i))log(1-h0(x(i)))]\n",
  54. " c <- (1/instances) * sum((-Y * log(sig)) - ((1-Y) * log(1-sig)))\n",
  55. " return(c)\n",
  56. " \n",
  57. "}\n",
  58. "\n",
  59. "## Applying the logistic regression\n",
  60. "logit <- function(X,Y, sigmoid, cost, fit = TRUE, optimized_thetas = NULL, prob_return = TRUE, threshold = 0.5){\n",
  61. " ## This library is going to be used for my accuracy assessments\n",
  62. " library(\"caret\")\n",
  63. " ## This will be used if we are fitting to the dataset\n",
  64. " if(fit == TRUE){\n",
  65. " ## Initializing theta\n",
  66. " init_theta <- rep(0, ncol(X))\n",
  67. " print(\"Initial cost with intial Thetas: \")\n",
  68. " ## Declaring X and Y as global variable \n",
  69. " X <<- X\n",
  70. " Y <<- Y\n",
  71. " ## The initial cost of the model with unoptimized theta\n",
  72. " print(cost_function(init_theta))\n",
  73. " ## Optimizing theta to reduce overall cost \n",
  74. " optimized_theta <-solve(t(X) %*% X) %*% t(X) %*% Y ## Closed-form normal equations (inverse of t(X) %*% X via solve, times t(X) %*% Y; t() transposes). NOTE(review): this is the OLS least-squares solution, not the logistic-regression MLE -- an iterative optimizer (e.g. optim or IRLS) would be needed to minimize the logistic cost \n",
  75. " print(\"Optimized Parameters(Thetas): \")\n",
  76. " print(optimized_theta)\n",
  77. " print(\"Optimized Cost: \")\n",
  78. " print(cost_function(optimized_theta))\n",
  79. " return(optimized_theta)\n",
  80. " }else{\n",
  81. " ## calculating the probabilities of the predictions by matrix multiplication of X with the optimized thetas\n",
  82. " probs <- sigmoid(X %*% optimized_thetas)\n",
  83. " ## Creating a vector from the probabilities\n",
  84. "\n",
  85. " probs <- c(probs)\n",
  86. " ## Creating an empty vector\n",
  87. " classes <- vector()\n",
  88. " for (value in probs){\n",
  89. " ## Appending the class as one if the prob value is above threshold\n",
  90. " if (value > threshold){\n",
  91. " classes <- c(classes, c(1))\n",
  92. " #append(classes, c(1))\n",
  93. " ## Reverse is true \n",
  94. " }else{\n",
  95. " classes <- c(classes, c(0))\n",
  96. " #append(classes, c(0))\n",
  97. " } \n",
  98. " \n",
  99. " }\n",
  100. " ## printing my accuracy metrics\n",
  101. " print(confusionMatrix(as.factor(c(classes)),as.factor(c(Y))))\n",
  102. " ## If else depending on if you want probabilities or predictions\n",
  103. " if(prob_return == TRUE){\n",
  104. " return(c(probs))}else{\n",
  105. " return(classes) \n",
  106. " }\n",
  107. " \n",
  108. " }\n",
  109. "\n",
  110. "}"
  111. ]
  112. },
  113. {
  114. "cell_type": "code",
  115. "execution_count": 4,
  116. "metadata": {},
  117. "outputs": [
  118. {
  119. "name": "stderr",
  120. "output_type": "stream",
  121. "text": [
  122. "Loading required package: lattice\n",
  123. "Loading required package: ggplot2\n"
  124. ]
  125. },
  126. {
  127. "name": "stdout",
  128. "output_type": "stream",
  129. "text": [
  130. "[1] \"Initial cost with intial Thetas: \"\n",
  131. "[1] 0.6931472\n",
  132. "[1] \"Optimized Parameters(Thetas): \"\n",
  133. " [,1]\n",
  134. "X1 0.02713837\n",
  135. "X2 0.34743736\n",
  136. "[1] \"Optimized Cost: \"\n",
  137. "[1] 0.5936758\n"
  138. ]
  139. }
  140. ],
  141. "source": [
  142. "## Training and optimizing the data. \n",
  143. "optimized_thetas <- logit(X_train,Y_train, sigmoid, cost_function)\n"
  144. ]
  145. },
  146. {
  147. "cell_type": "code",
  148. "execution_count": 5,
  149. "metadata": {},
  150. "outputs": [
  151. {
  152. "name": "stdout",
  153. "output_type": "stream",
  154. "text": [
  155. "Confusion Matrix and Statistics\n",
  156. "\n",
  157. " Reference\n",
  158. "Prediction 0 1\n",
  159. " 0 368 18\n",
  160. " 1 107 257\n",
  161. " \n",
  162. " Accuracy : 0.8333 \n",
  163. " 95% CI : (0.8047, 0.8593)\n",
  164. " No Information Rate : 0.6333 \n",
  165. " P-Value [Acc > NIR] : < 2.2e-16 \n",
  166. " \n",
  167. " Kappa : 0.664 \n",
  168. " \n",
  169. " Mcnemar's Test P-Value : 3.519e-15 \n",
  170. " \n",
  171. " Sensitivity : 0.7747 \n",
  172. " Specificity : 0.9345 \n",
  173. " Pos Pred Value : 0.9534 \n",
  174. " Neg Pred Value : 0.7060 \n",
  175. " Prevalence : 0.6333 \n",
  176. " Detection Rate : 0.4907 \n",
  177. " Detection Prevalence : 0.5147 \n",
  178. " Balanced Accuracy : 0.8546 \n",
  179. " \n",
  180. " 'Positive' Class : 0 \n",
  181. " \n"
  182. ]
  183. }
  184. ],
  185. "source": [
  186. "## Class predictions with the training set\n",
  187. "train_set <-logit(X_train,Y_train, sigmoid, cost_function, fit = FALSE, optimized_thetas = optimized_thetas, prob_return = FALSE)"
  188. ]
  189. },
  190. {
  191. "cell_type": "code",
  192. "execution_count": 6,
  193. "metadata": {},
  194. "outputs": [
  195. {
  196. "name": "stdout",
  197. "output_type": "stream",
  198. "text": [
  199. "Confusion Matrix and Statistics\n",
  200. "\n",
  201. " Reference\n",
  202. "Prediction 0 1\n",
  203. " 0 125 6\n",
  204. " 1 38 81\n",
  205. " \n",
  206. " Accuracy : 0.824 \n",
  207. " 95% CI : (0.771, 0.8691)\n",
  208. " No Information Rate : 0.652 \n",
  209. " P-Value [Acc > NIR] : 1.309e-09 \n",
  210. " \n",
  211. " Kappa : 0.6428 \n",
  212. " \n",
  213. " Mcnemar's Test P-Value : 2.962e-06 \n",
  214. " \n",
  215. " Sensitivity : 0.7669 \n",
  216. " Specificity : 0.9310 \n",
  217. " Pos Pred Value : 0.9542 \n",
  218. " Neg Pred Value : 0.6807 \n",
  219. " Prevalence : 0.6520 \n",
  220. " Detection Rate : 0.5000 \n",
  221. " Detection Prevalence : 0.5240 \n",
  222. " Balanced Accuracy : 0.8490 \n",
  223. " \n",
  224. " 'Positive' Class : 0 \n",
  225. " \n"
  226. ]
  227. }
  228. ],
  229. "source": [
  230. "## Class predictions with the test set\n",
  231. "test_set <- logit(X_test,Y_test, sigmoid, cost_function, fit = FALSE, optimized_thetas = optimized_thetas, prob_return = FALSE, threshold = 0.5)"
  232. ]
  233. },
  234. {
  235. "cell_type": "code",
  236. "execution_count": 7,
  237. "metadata": {},
  238. "outputs": [
  239. {
  240. "name": "stdout",
  241. "output_type": "stream",
  242. "text": [
  243. "[1] \"Train Set Predictions head: \"\n",
  244. "[1] 0 0 0 1 0 1\n",
  245. "[1] \"Test Set Predictions head\"\n",
  246. "[1] 0 1 1 0 0 1\n"
  247. ]
  248. }
  249. ],
  250. "source": [
  251. "print(\"Train Set Predictions head: \")\n",
  252. "print(head(train_set))\n",
  253. "print(\"Test Set Predictions head\")\n",
  254. "print(head(test_set))"
  255. ]
  256. },
  257. {
  258. "cell_type": "code",
  259. "execution_count": 49,
  260. "metadata": {},
  261. "outputs": [
  262. {
  263. "data": {
  264. "text/plain": [
  265. "\n",
  266. "Call:\n",
  267. "glm(formula = Y ~ ., family = binomial(link = \"logit\"), data = train)\n",
  268. "\n",
  269. "Deviance Residuals: \n",
  270. " Min 1Q Median 3Q Max \n",
  271. "-3.1716 -0.4111 -0.1143 0.3058 2.7664 \n",
  272. "\n",
  273. "Coefficients:\n",
  274. " Estimate Std. Error z value Pr(>|z|) \n",
  275. "(Intercept) -1.2758 0.1410 -9.050 <2e-16 ***\n",
  276. "X1 0.3090 0.1174 2.632 0.0085 ** \n",
  277. "X2 3.7959 0.2902 13.082 <2e-16 ***\n",
  278. "---\n",
  279. "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
  280. "\n",
  281. "(Dispersion parameter for binomial family taken to be 1)\n",
  282. "\n",
  283. " Null deviance: 985.74 on 749 degrees of freedom\n",
  284. "Residual deviance: 444.08 on 747 degrees of freedom\n",
  285. "AIC: 450.08\n",
  286. "\n",
  287. "Number of Fisher Scoring iterations: 6\n"
  288. ]
  289. },
  290. "metadata": {},
  291. "output_type": "display_data"
  292. },
  293. {
  294. "data": {
  295. "text/plain": [
  296. "Confusion Matrix and Statistics\n",
  297. "\n",
  298. " Reference\n",
  299. "Prediction 0 1\n",
  300. " 0 447 81\n",
  301. " 1 28 194\n",
  302. " \n",
  303. " Accuracy : 0.8547 \n",
  304. " 95% CI : (0.8274, 0.8791)\n",
  305. " No Information Rate : 0.6333 \n",
  306. " P-Value [Acc > NIR] : < 2.2e-16 \n",
  307. " \n",
  308. " Kappa : 0.6738 \n",
  309. " \n",
  310. " Mcnemar's Test P-Value : 6.336e-07 \n",
  311. " \n",
  312. " Sensitivity : 0.9411 \n",
  313. " Specificity : 0.7055 \n",
  314. " Pos Pred Value : 0.8466 \n",
  315. " Neg Pred Value : 0.8739 \n",
  316. " Prevalence : 0.6333 \n",
  317. " Detection Rate : 0.5960 \n",
  318. " Detection Prevalence : 0.7040 \n",
  319. " Balanced Accuracy : 0.8233 \n",
  320. " \n",
  321. " 'Positive' Class : 0 \n",
  322. " "
  323. ]
  324. },
  325. "metadata": {},
  326. "output_type": "display_data"
  327. }
  328. ],
  329. "source": [
  330. "## Using glm logistic regression\n",
  331. "#train\n",
  332. "model <- glm(Y ~., family = binomial(link = \"logit\"), data = train)\n",
  333. "summary(model)\n",
  334. "\n",
  335. "p <- predict(model, train[,c(1:2)])\n",
  336. "\n",
  337. "p_class <- factor(ifelse(p > 0.5, 1, 0), levels = levels(factor(train[[\"Y\"]])))\n",
  338. "\n",
  339. "confusionMatrix(p_class, factor(train[[\"Y\"]]))"
  340. ]
  341. },
  342. {
  343. "cell_type": "code",
  344. "execution_count": 50,
  345. "metadata": {},
  346. "outputs": [
  347. {
  348. "data": {
  349. "text/plain": [
  350. "\n",
  351. "Call:\n",
  352. "glm(formula = Y ~ ., family = binomial(link = \"logit\"), data = test)\n",
  353. "\n",
  354. "Deviance Residuals: \n",
  355. " Min 1Q Median 3Q Max \n",
  356. "-2.45433 -0.37074 -0.07632 0.28170 2.14015 \n",
  357. "\n",
  358. "Coefficients:\n",
  359. " Estimate Std. Error z value Pr(>|z|) \n",
  360. "(Intercept) -1.3891 0.2666 -5.210 1.89e-07 ***\n",
  361. "X1 0.3998 0.2248 1.778 0.0753 . \n",
  362. "X2 3.7630 0.5117 7.353 1.93e-13 ***\n",
  363. "---\n",
  364. "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
  365. "\n",
  366. "(Dispersion parameter for binomial family taken to be 1)\n",
  367. "\n",
  368. " Null deviance: 323.1 on 249 degrees of freedom\n",
  369. "Residual deviance: 136.9 on 247 degrees of freedom\n",
  370. "AIC: 142.9\n",
  371. "\n",
  372. "Number of Fisher Scoring iterations: 7\n"
  373. ]
  374. },
  375. "metadata": {},
  376. "output_type": "display_data"
  377. },
  378. {
  379. "data": {
  380. "text/plain": [
  381. "Confusion Matrix and Statistics\n",
  382. "\n",
  383. " Reference\n",
  384. "Prediction 0 1\n",
  385. " 0 153 22\n",
  386. " 1 10 65\n",
  387. " \n",
  388. " Accuracy : 0.872 \n",
  389. " 95% CI : (0.8241, 0.9108)\n",
  390. " No Information Rate : 0.652 \n",
  391. " P-Value [Acc > NIR] : 2.437e-15 \n",
  392. " \n",
  393. " Kappa : 0.7086 \n",
  394. " \n",
  395. " Mcnemar's Test P-Value : 0.05183 \n",
  396. " \n",
  397. " Sensitivity : 0.9387 \n",
  398. " Specificity : 0.7471 \n",
  399. " Pos Pred Value : 0.8743 \n",
  400. " Neg Pred Value : 0.8667 \n",
  401. " Prevalence : 0.6520 \n",
  402. " Detection Rate : 0.6120 \n",
  403. " Detection Prevalence : 0.7000 \n",
  404. " Balanced Accuracy : 0.8429 \n",
  405. " \n",
  406. " 'Positive' Class : 0 \n",
  407. " "
  408. ]
  409. },
  410. "metadata": {},
  411. "output_type": "display_data"
  412. }
  413. ],
  414. "source": [
  415. "model <- glm(Y ~., family = binomial(link = \"logit\"), data = test)\n",
  416. "summary(model)\n",
  417. "\n",
  418. "\n",
  419. "\n",
  420. "\n",
  421. "\n",
  422. "p <- predict(model, test[,c(1:2)])\n",
  423. "\n",
  424. "p_class <- factor(ifelse(p > 0.5, 1, 0), levels = levels(factor(test[[\"Y\"]])))\n",
  425. "\n",
  426. "confusionMatrix(p_class, factor(test[[\"Y\"]]))"
  427. ]
  428. },
  429. {
  430. "cell_type": "code",
  431. "execution_count": null,
  432. "metadata": {},
  433. "outputs": [],
  434. "source": [
  435. "\n"
  436. ]
  437. },
  438. {
  439. "cell_type": "code",
  440. "execution_count": null,
  441. "metadata": {},
  442. "outputs": [],
  443. "source": []
  444. }
  445. ],
  446. "metadata": {
  447. "kernelspec": {
  448. "display_name": "R",
  449. "language": "R",
  450. "name": "ir"
  451. },
  452. "language_info": {
  453. "codemirror_mode": "r",
  454. "file_extension": ".r",
  455. "mimetype": "text/x-r-source",
  456. "name": "R",
  457. "pygments_lexer": "r",
  458. "version": "3.4.4"
  459. }
  460. },
  461. "nbformat": 4,
  462. "nbformat_minor": 2
  463. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement