Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "df <- read.csv(\"logistic.csv\")\n",
- "#head(df, 15)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Splitting the dataset\n",
- "\n",
- "n_observations <- nrow(df)\n",
- "perm_rows <- sample(n_observations)\n",
- "df_shuffled <- df[perm_rows, ]\n",
- "split <- round(n_observations* 0.75)\n",
- "train <- df_shuffled[1:split, ]\n",
- "test <- df_shuffled[(split +1): n_observations, ]\n",
- "\n",
- "X_train <- as.matrix(train[, c(1:2)])\n",
- "Y_train <- as.matrix(train[, c(3)])\n",
- "X_test <- as.matrix(test[ ,c(1:2)]) \n",
- "Y_test <- as.matrix(test[,c(3)])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "###Sigmoid function\n",
- "\n",
- "sigmoid <- function(x) {sig <- 1/(1+exp(-x)) \n",
- " return(sig)}\n",
- " \n",
- "\n",
- "## Logistic Loss Function: \n",
- "cost_function<- function(t){\n",
- " ## Taking in the x values with num rows\n",
- " instances <- nrow(X)\n",
- " ## Defining the output of the sigmoid function by X matrix matmul theta \n",
- " sig <- sigmoid(X%*%t)\n",
- " \n",
- " ## Approximating the cost function -1/m[E y(i)logh0(x(i)) + (1- y(i))log(1-h0(x(i)))]\n",
- " c <- (1/instances) * sum((-Y * log(sig)) - ((1-Y) * log(1-sig)))\n",
- " return(c)\n",
- " \n",
- "}\n",
- "\n",
- "## Applying the logistic regression\n",
- "logit <- function(X,Y, sigmoid, cost, fit = TRUE, optimized_thetas = None, prob_return = TRUE, threshold = 0.5){\n",
- " ## This library is going to be used for my accuracy assessments\n",
- " library(\"caret\")\n",
- " ## This will be used if we are fitting to the dataset\n",
- " if(fit == TRUE){\n",
- " ## Initializing theta\n",
- " init_theta <- rep(0, ncol(X))\n",
- " print(\"Initial cost with intial Thetas: \")\n",
- " ## Declaring X and Y as global variable \n",
- " X <<- X\n",
- " Y <<- Y\n",
- " ## The initial cost of the model with unoptimized theta\n",
- " print(cost_function(init_theta))\n",
- " ## Optimizing theta to reduce overall cost \n",
- " optimized_theta <-solve(t(X) %*% X) %*% t(X) %*% Y ## The solve function is meant to solve for the inverse of the matrix resulting from transpose(x) * x, then matmul out the inverse by x by y. the t function is simply transposing the matrix \n",
- " print(\"Optimized Parameters(Thetas): \")\n",
- " print(optimized_theta)\n",
- " print(\"Optimized Cost: \")\n",
- " print(cost_function(optimized_theta))\n",
- " return(optimized_theta)\n",
- " }else{\n",
- " ## calculating the probabilities of the predictions by matrix multiplication of X with the optimized thetas\n",
- " probs <- sigmoid(X %*% optimized_thetas)\n",
- " ## Creating a vector from the probabilites\n",
- "\n",
- " probs <- c(probs)\n",
- " ## Creating an empty vector\n",
- " classes <- vector()\n",
- " for (value in probs){\n",
- " ## Appending the class as one if the prob value is above threshold\n",
- " if (value > threshold){\n",
- " classes <- c(classes, c(1))\n",
- " #append(classes, c(1))\n",
- " ## Reverse is true \n",
- " }else{\n",
- " classes <- c(classes, c(0))\n",
- " #append(classes, c(0))\n",
- " } \n",
- " \n",
- " }\n",
- " ## printing my accuracy metrics\n",
- " print(confusionMatrix(as.factor(c(classes)),as.factor(c(Y))))\n",
- " ## If else depending on if you want probabilites or predictions\n",
- " if(prob_return == TRUE){\n",
- " return(c(probs))}else{\n",
- " return(classes) \n",
- " }\n",
- " \n",
- " }\n",
- "\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Loading required package: lattice\n",
- "Loading required package: ggplot2\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[1] \"Initial cost with intial Thetas: \"\n",
- "[1] 0.6931472\n",
- "[1] \"Optimized Parameters(Thetas): \"\n",
- " [,1]\n",
- "X1 0.02713837\n",
- "X2 0.34743736\n",
- "[1] \"Optimized Cost: \"\n",
- "[1] 0.5936758\n"
- ]
- }
- ],
- "source": [
- "## Training and opimizing the data. \n",
- "optimized_thetas <- logit(X_train,Y_train, sigmoid, cost_function)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Confusion Matrix and Statistics\n",
- "\n",
- " Reference\n",
- "Prediction 0 1\n",
- " 0 368 18\n",
- " 1 107 257\n",
- " \n",
- " Accuracy : 0.8333 \n",
- " 95% CI : (0.8047, 0.8593)\n",
- " No Information Rate : 0.6333 \n",
- " P-Value [Acc > NIR] : < 2.2e-16 \n",
- " \n",
- " Kappa : 0.664 \n",
- " \n",
- " Mcnemar's Test P-Value : 3.519e-15 \n",
- " \n",
- " Sensitivity : 0.7747 \n",
- " Specificity : 0.9345 \n",
- " Pos Pred Value : 0.9534 \n",
- " Neg Pred Value : 0.7060 \n",
- " Prevalence : 0.6333 \n",
- " Detection Rate : 0.4907 \n",
- " Detection Prevalence : 0.5147 \n",
- " Balanced Accuracy : 0.8546 \n",
- " \n",
- " 'Positive' Class : 0 \n",
- " \n"
- ]
- }
- ],
- "source": [
- "## Class predictions with the training set\n",
- "train_set <-logit(X_train,Y_train, sigmoid, cost_function, fit = FALSE, optimized_thetas = optimized_thetas, prob_return = FALSE)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Confusion Matrix and Statistics\n",
- "\n",
- " Reference\n",
- "Prediction 0 1\n",
- " 0 125 6\n",
- " 1 38 81\n",
- " \n",
- " Accuracy : 0.824 \n",
- " 95% CI : (0.771, 0.8691)\n",
- " No Information Rate : 0.652 \n",
- " P-Value [Acc > NIR] : 1.309e-09 \n",
- " \n",
- " Kappa : 0.6428 \n",
- " \n",
- " Mcnemar's Test P-Value : 2.962e-06 \n",
- " \n",
- " Sensitivity : 0.7669 \n",
- " Specificity : 0.9310 \n",
- " Pos Pred Value : 0.9542 \n",
- " Neg Pred Value : 0.6807 \n",
- " Prevalence : 0.6520 \n",
- " Detection Rate : 0.5000 \n",
- " Detection Prevalence : 0.5240 \n",
- " Balanced Accuracy : 0.8490 \n",
- " \n",
- " 'Positive' Class : 0 \n",
- " \n"
- ]
- }
- ],
- "source": [
- "## Class predictions with the test set\n",
- "test_set <- logit(X_test,Y_test, sigmoid, cost_function, fit = FALSE, optimized_thetas = optimized_thetas, prob_return = FALSE, threshold = 0.5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[1] \"Train Set Predictions head: \"\n",
- "[1] 0 0 0 1 0 1\n",
- "[1] \"Test Set Predictions head\"\n",
- "[1] 0 1 1 0 0 1\n"
- ]
- }
- ],
- "source": [
- "print(\"Train Set Predictions head: \")\n",
- "print(head(train_set))\n",
- "print(\"Test Set Predictions head\")\n",
- "print(head(test_set))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\n",
- "Call:\n",
- "glm(formula = Y ~ ., family = binomial(link = \"logit\"), data = train)\n",
- "\n",
- "Deviance Residuals: \n",
- " Min 1Q Median 3Q Max \n",
- "-3.1716 -0.4111 -0.1143 0.3058 2.7664 \n",
- "\n",
- "Coefficients:\n",
- " Estimate Std. Error z value Pr(>|z|) \n",
- "(Intercept) -1.2758 0.1410 -9.050 <2e-16 ***\n",
- "X1 0.3090 0.1174 2.632 0.0085 ** \n",
- "X2 3.7959 0.2902 13.082 <2e-16 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
- "\n",
- "(Dispersion parameter for binomial family taken to be 1)\n",
- "\n",
- " Null deviance: 985.74 on 749 degrees of freedom\n",
- "Residual deviance: 444.08 on 747 degrees of freedom\n",
- "AIC: 450.08\n",
- "\n",
- "Number of Fisher Scoring iterations: 6\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "Confusion Matrix and Statistics\n",
- "\n",
- " Reference\n",
- "Prediction 0 1\n",
- " 0 447 81\n",
- " 1 28 194\n",
- " \n",
- " Accuracy : 0.8547 \n",
- " 95% CI : (0.8274, 0.8791)\n",
- " No Information Rate : 0.6333 \n",
- " P-Value [Acc > NIR] : < 2.2e-16 \n",
- " \n",
- " Kappa : 0.6738 \n",
- " \n",
- " Mcnemar's Test P-Value : 6.336e-07 \n",
- " \n",
- " Sensitivity : 0.9411 \n",
- " Specificity : 0.7055 \n",
- " Pos Pred Value : 0.8466 \n",
- " Neg Pred Value : 0.8739 \n",
- " Prevalence : 0.6333 \n",
- " Detection Rate : 0.5960 \n",
- " Detection Prevalence : 0.7040 \n",
- " Balanced Accuracy : 0.8233 \n",
- " \n",
- " 'Positive' Class : 0 \n",
- " "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "## Using glm logistic regression\n",
- "#train\n",
- "model <- glm(Y ~., family = binomial(link = \"logit\"), data = train)\n",
- "summary(model)\n",
- "\n",
- "p <- predict(model, train[,c(1:2)])\n",
- "\n",
- "p_class <- factor(ifelse(p > 0.5, 1, 0), levels = levels(factor(train[[\"Y\"]])))\n",
- "\n",
- "confusionMatrix(p_class, factor(train[[\"Y\"]]))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\n",
- "Call:\n",
- "glm(formula = Y ~ ., family = binomial(link = \"logit\"), data = test)\n",
- "\n",
- "Deviance Residuals: \n",
- " Min 1Q Median 3Q Max \n",
- "-2.45433 -0.37074 -0.07632 0.28170 2.14015 \n",
- "\n",
- "Coefficients:\n",
- " Estimate Std. Error z value Pr(>|z|) \n",
- "(Intercept) -1.3891 0.2666 -5.210 1.89e-07 ***\n",
- "X1 0.3998 0.2248 1.778 0.0753 . \n",
- "X2 3.7630 0.5117 7.353 1.93e-13 ***\n",
- "---\n",
- "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
- "\n",
- "(Dispersion parameter for binomial family taken to be 1)\n",
- "\n",
- " Null deviance: 323.1 on 249 degrees of freedom\n",
- "Residual deviance: 136.9 on 247 degrees of freedom\n",
- "AIC: 142.9\n",
- "\n",
- "Number of Fisher Scoring iterations: 7\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "Confusion Matrix and Statistics\n",
- "\n",
- " Reference\n",
- "Prediction 0 1\n",
- " 0 153 22\n",
- " 1 10 65\n",
- " \n",
- " Accuracy : 0.872 \n",
- " 95% CI : (0.8241, 0.9108)\n",
- " No Information Rate : 0.652 \n",
- " P-Value [Acc > NIR] : 2.437e-15 \n",
- " \n",
- " Kappa : 0.7086 \n",
- " \n",
- " Mcnemar's Test P-Value : 0.05183 \n",
- " \n",
- " Sensitivity : 0.9387 \n",
- " Specificity : 0.7471 \n",
- " Pos Pred Value : 0.8743 \n",
- " Neg Pred Value : 0.8667 \n",
- " Prevalence : 0.6520 \n",
- " Detection Rate : 0.6120 \n",
- " Detection Prevalence : 0.7000 \n",
- " Balanced Accuracy : 0.8429 \n",
- " \n",
- " 'Positive' Class : 0 \n",
- " "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "model <- glm(Y ~., family = binomial(link = \"logit\"), data = test)\n",
- "summary(model)\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "p <- predict(model, test[,c(1:2)])\n",
- "\n",
- "p_class <- factor(ifelse(p > 0.5, 1, 0), levels = levels(factor(test[[\"Y\"]])))\n",
- "\n",
- "confusionMatrix(p_class, factor(test[[\"Y\"]]))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "R",
- "language": "R",
- "name": "ir"
- },
- "language_info": {
- "codemirror_mode": "r",
- "file_extension": ".r",
- "mimetype": "text/x-r-source",
- "name": "R",
- "pygments_lexer": "r",
- "version": "3.4.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement