Advertisement
Guest User

Untitled

a guest
May 20th, 2019
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.33 KB | None | 0 0
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "df <- read.csv(\"logistic.csv\")\n",
  10. "#head(df, 15)\n"
  11. ]
  12. },
  13. {
  14. "cell_type": "code",
  15. "execution_count": 2,
  16. "metadata": {},
  17. "outputs": [],
  18. "source": [
  19. "## Splitting the dataset\n",
  20. "\n",
  21. "n_observations <- nrow(df)\n",
  22. "perm_rows <- sample(n_observations)\n",
  23. "df_shuffled <- df[perm_rows, ]\n",
  24. "split <- round(n_observations* 0.75)\n",
  25. "train <- df_shuffled[1:split, ]\n",
  26. "test <- df_shuffled[(split +1): n_observations, ]\n",
  27. "\n",
  28. "X_train <- as.matrix(train[, c(1:2)])\n",
  29. "Y_train <- as.matrix(train[, c(3)])\n",
  30. "X_test <- as.matrix(test[ ,c(1:2)]) \n",
  31. "Y_test <- as.matrix(test[,c(3)])"
  32. ]
  33. },
  34. {
  35. "cell_type": "code",
  36. "execution_count": 3,
  37. "metadata": {},
  38. "outputs": [],
  39. "source": [
  40. "###Sigmoid function\n",
  41. "\n",
  42. "sigmoid <- function(x) {sig <- 1/(1+exp(-x)) \n",
  43. " return(sig)}\n",
  44. " \n",
  45. "\n",
  46. "## Logistic Loss Function: \n",
  47. "cost_function<- function(t){\n",
  48. " ## Taking in the x values with num rows\n",
  49. " instances <- nrow(X)\n",
  50. " ## Defining the output of the sigmoid function by X matrix matmul theta \n",
  51. " sig <- sigmoid(X%*%t)\n",
  52. " \n",
  53. " ## Approximating the cost function -1/m[E y(i)logh0(x(i)) + (1- y(i))log(1-h0(x(i)))]\n",
  54. " c <- (1/instances) * sum((-Y * log(sig)) - ((1-Y) * log(1-sig)))\n",
  55. " return(c)\n",
  56. " \n",
  57. "}\n",
  58. "\n",
  59. "## Applying the logistic regression\n",
  60. "logit <- function(X,Y, sigmoid, cost, fit = TRUE, optimized_thetas = NULL, prob_return = TRUE, threshold = 0.5){\n",
  61. " ## This library is going to be used for my accuracy assessments\n",
  62. " library(\"caret\")\n",
  63. " ## This will be used if we are fitting to the dataset\n",
  64. " if(fit == TRUE){\n",
  65. " ## Initializing theta\n",
  66. " init_theta <- rep(0, ncol(X))\n",
  67. " print(\"Initial cost with intial Thetas: \")\n",
  68. " ## Declaring X and Y as global variable \n",
  69. " X <<- X\n",
  70. " Y <<- Y\n",
  71. " ## The initial cost of the model with unoptimized theta\n",
  72. " print(cost_function(init_theta))\n",
  73. " ## Optimizing theta to reduce overall cost \n",
  74. " optimized_theta <-solve(t(X) %*% X) %*% t(X) %*% Y ## Closed-form normal equations (inverse of t(X) %*% X via solve, times t(X) %*% Y; t() transposes). NOTE(review): this is the OLS least-squares solution, not the logistic-regression MLE -- an iterative optimizer (e.g. optim or IRLS) would be needed to minimize the logistic cost \n",
  75. " print(\"Optimized Parameters(Thetas): \")\n",
  76. " print(optimized_theta)\n",
  77. " print(\"Optimized Cost: \")\n",
  78. " print(cost_function(optimized_theta))\n",
  79. " return(optimized_theta)\n",
  80. " }else{\n",
  81. " ## calculating the probabilities of the predictions by matrix multiplication of X with the optimized thetas\n",
  82. " probs <- sigmoid(X %*% optimized_thetas)\n",
  83. " ## Creating a vector from the probabilities\n",
  84. "\n",
  85. " probs <- c(probs)\n",
  86. " ## Creating an empty vector\n",
  87. " classes <- vector()\n",
  88. " for (value in probs){\n",
  89. " ## Appending the class as one if the prob value is above threshold\n",
  90. " if (value > threshold){\n",
  91. " classes <- c(classes, c(1))\n",
  92. " #append(classes, c(1))\n",
  93. " ## Reverse is true \n",
  94. " }else{\n",
  95. " classes <- c(classes, c(0))\n",
  96. " #append(classes, c(0))\n",
  97. " } \n",
  98. " \n",
  99. " }\n",
  100. " ## printing my accuracy metrics\n",
  101. " print(confusionMatrix(as.factor(c(classes)),as.factor(c(Y))))\n",
  102. " ## If else depending on if you want probabilities or predictions\n",
  103. " if(prob_return == TRUE){\n",
  104. " return(c(probs))}else{\n",
  105. " return(classes) \n",
  106. " }\n",
  107. " \n",
  108. " }\n",
  109. "\n",
  110. "}"
  111. ]
  112. },
  113. {
  114. "cell_type": "code",
  115. "execution_count": 4,
  116. "metadata": {},
  117. "outputs": [
  118. {
  119. "name": "stderr",
  120. "output_type": "stream",
  121. "text": [
  122. "Loading required package: lattice\n",
  123. "Loading required package: ggplot2\n"
  124. ]
  125. },
  126. {
  127. "name": "stdout",
  128. "output_type": "stream",
  129. "text": [
  130. "[1] \"Initial cost with intial Thetas: \"\n",
  131. "[1] 0.6931472\n",
  132. "[1] \"Optimized Parameters(Thetas): \"\n",
  133. " [,1]\n",
  134. "X1 0.02713837\n",
  135. "X2 0.34743736\n",
  136. "[1] \"Optimized Cost: \"\n",
  137. "[1] 0.5936758\n"
  138. ]
  139. }
  140. ],
  141. "source": [
  142. "## Training and optimizing the data. \n",
  143. "optimized_thetas <- logit(X_train,Y_train, sigmoid, cost_function)\n"
  144. ]
  145. },
  146. {
  147. "cell_type": "code",
  148. "execution_count": 5,
  149. "metadata": {},
  150. "outputs": [
  151. {
  152. "name": "stdout",
  153. "output_type": "stream",
  154. "text": [
  155. "Confusion Matrix and Statistics\n",
  156. "\n",
  157. " Reference\n",
  158. "Prediction 0 1\n",
  159. " 0 368 18\n",
  160. " 1 107 257\n",
  161. " \n",
  162. " Accuracy : 0.8333 \n",
  163. " 95% CI : (0.8047, 0.8593)\n",
  164. " No Information Rate : 0.6333 \n",
  165. " P-Value [Acc > NIR] : < 2.2e-16 \n",
  166. " \n",
  167. " Kappa : 0.664 \n",
  168. " \n",
  169. " Mcnemar's Test P-Value : 3.519e-15 \n",
  170. " \n",
  171. " Sensitivity : 0.7747 \n",
  172. " Specificity : 0.9345 \n",
  173. " Pos Pred Value : 0.9534 \n",
  174. " Neg Pred Value : 0.7060 \n",
  175. " Prevalence : 0.6333 \n",
  176. " Detection Rate : 0.4907 \n",
  177. " Detection Prevalence : 0.5147 \n",
  178. " Balanced Accuracy : 0.8546 \n",
  179. " \n",
  180. " 'Positive' Class : 0 \n",
  181. " \n"
  182. ]
  183. }
  184. ],
  185. "source": [
  186. "## Class predictions with the training set\n",
  187. "train_set <-logit(X_train,Y_train, sigmoid, cost_function, fit = FALSE, optimized_thetas = optimized_thetas, prob_return = FALSE)"
  188. ]
  189. },
  190. {
  191. "cell_type": "code",
  192. "execution_count": 6,
  193. "metadata": {},
  194. "outputs": [
  195. {
  196. "name": "stdout",
  197. "output_type": "stream",
  198. "text": [
  199. "Confusion Matrix and Statistics\n",
  200. "\n",
  201. " Reference\n",
  202. "Prediction 0 1\n",
  203. " 0 125 6\n",
  204. " 1 38 81\n",
  205. " \n",
  206. " Accuracy : 0.824 \n",
  207. " 95% CI : (0.771, 0.8691)\n",
  208. " No Information Rate : 0.652 \n",
  209. " P-Value [Acc > NIR] : 1.309e-09 \n",
  210. " \n",
  211. " Kappa : 0.6428 \n",
  212. " \n",
  213. " Mcnemar's Test P-Value : 2.962e-06 \n",
  214. " \n",
  215. " Sensitivity : 0.7669 \n",
  216. " Specificity : 0.9310 \n",
  217. " Pos Pred Value : 0.9542 \n",
  218. " Neg Pred Value : 0.6807 \n",
  219. " Prevalence : 0.6520 \n",
  220. " Detection Rate : 0.5000 \n",
  221. " Detection Prevalence : 0.5240 \n",
  222. " Balanced Accuracy : 0.8490 \n",
  223. " \n",
  224. " 'Positive' Class : 0 \n",
  225. " \n"
  226. ]
  227. }
  228. ],
  229. "source": [
  230. "## Class predictions with the test set\n",
  231. "test_set <- logit(X_test,Y_test, sigmoid, cost_function, fit = FALSE, optimized_thetas = optimized_thetas, prob_return = FALSE, threshold = 0.5)"
  232. ]
  233. },
  234. {
  235. "cell_type": "code",
  236. "execution_count": 7,
  237. "metadata": {},
  238. "outputs": [
  239. {
  240. "name": "stdout",
  241. "output_type": "stream",
  242. "text": [
  243. "[1] \"Train Set Predictions head: \"\n",
  244. "[1] 0 0 0 1 0 1\n",
  245. "[1] \"Test Set Predictions head\"\n",
  246. "[1] 0 1 1 0 0 1\n"
  247. ]
  248. }
  249. ],
  250. "source": [
  251. "print(\"Train Set Predictions head: \")\n",
  252. "print(head(train_set))\n",
  253. "print(\"Test Set Predictions head\")\n",
  254. "print(head(test_set))"
  255. ]
  256. },
  257. {
  258. "cell_type": "code",
  259. "execution_count": 49,
  260. "metadata": {},
  261. "outputs": [
  262. {
  263. "data": {
  264. "text/plain": [
  265. "\n",
  266. "Call:\n",
  267. "glm(formula = Y ~ ., family = binomial(link = \"logit\"), data = train)\n",
  268. "\n",
  269. "Deviance Residuals: \n",
  270. " Min 1Q Median 3Q Max \n",
  271. "-3.1716 -0.4111 -0.1143 0.3058 2.7664 \n",
  272. "\n",
  273. "Coefficients:\n",
  274. " Estimate Std. Error z value Pr(>|z|) \n",
  275. "(Intercept) -1.2758 0.1410 -9.050 <2e-16 ***\n",
  276. "X1 0.3090 0.1174 2.632 0.0085 ** \n",
  277. "X2 3.7959 0.2902 13.082 <2e-16 ***\n",
  278. "---\n",
  279. "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
  280. "\n",
  281. "(Dispersion parameter for binomial family taken to be 1)\n",
  282. "\n",
  283. " Null deviance: 985.74 on 749 degrees of freedom\n",
  284. "Residual deviance: 444.08 on 747 degrees of freedom\n",
  285. "AIC: 450.08\n",
  286. "\n",
  287. "Number of Fisher Scoring iterations: 6\n"
  288. ]
  289. },
  290. "metadata": {},
  291. "output_type": "display_data"
  292. },
  293. {
  294. "data": {
  295. "text/plain": [
  296. "Confusion Matrix and Statistics\n",
  297. "\n",
  298. " Reference\n",
  299. "Prediction 0 1\n",
  300. " 0 447 81\n",
  301. " 1 28 194\n",
  302. " \n",
  303. " Accuracy : 0.8547 \n",
  304. " 95% CI : (0.8274, 0.8791)\n",
  305. " No Information Rate : 0.6333 \n",
  306. " P-Value [Acc > NIR] : < 2.2e-16 \n",
  307. " \n",
  308. " Kappa : 0.6738 \n",
  309. " \n",
  310. " Mcnemar's Test P-Value : 6.336e-07 \n",
  311. " \n",
  312. " Sensitivity : 0.9411 \n",
  313. " Specificity : 0.7055 \n",
  314. " Pos Pred Value : 0.8466 \n",
  315. " Neg Pred Value : 0.8739 \n",
  316. " Prevalence : 0.6333 \n",
  317. " Detection Rate : 0.5960 \n",
  318. " Detection Prevalence : 0.7040 \n",
  319. " Balanced Accuracy : 0.8233 \n",
  320. " \n",
  321. " 'Positive' Class : 0 \n",
  322. " "
  323. ]
  324. },
  325. "metadata": {},
  326. "output_type": "display_data"
  327. }
  328. ],
  329. "source": [
  330. "## Using glm logistic regression\n",
  331. "#train\n",
  332. "model <- glm(Y ~., family = binomial(link = \"logit\"), data = train)\n",
  333. "summary(model)\n",
  334. "\n",
  335. "p <- predict(model, train[,c(1:2)])\n",
  336. "\n",
  337. "p_class <- factor(ifelse(p > 0.5, 1, 0), levels = levels(factor(train[[\"Y\"]])))\n",
  338. "\n",
  339. "confusionMatrix(p_class, factor(train[[\"Y\"]]))"
  340. ]
  341. },
  342. {
  343. "cell_type": "code",
  344. "execution_count": 50,
  345. "metadata": {},
  346. "outputs": [
  347. {
  348. "data": {
  349. "text/plain": [
  350. "\n",
  351. "Call:\n",
  352. "glm(formula = Y ~ ., family = binomial(link = \"logit\"), data = test)\n",
  353. "\n",
  354. "Deviance Residuals: \n",
  355. " Min 1Q Median 3Q Max \n",
  356. "-2.45433 -0.37074 -0.07632 0.28170 2.14015 \n",
  357. "\n",
  358. "Coefficients:\n",
  359. " Estimate Std. Error z value Pr(>|z|) \n",
  360. "(Intercept) -1.3891 0.2666 -5.210 1.89e-07 ***\n",
  361. "X1 0.3998 0.2248 1.778 0.0753 . \n",
  362. "X2 3.7630 0.5117 7.353 1.93e-13 ***\n",
  363. "---\n",
  364. "Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1\n",
  365. "\n",
  366. "(Dispersion parameter for binomial family taken to be 1)\n",
  367. "\n",
  368. " Null deviance: 323.1 on 249 degrees of freedom\n",
  369. "Residual deviance: 136.9 on 247 degrees of freedom\n",
  370. "AIC: 142.9\n",
  371. "\n",
  372. "Number of Fisher Scoring iterations: 7\n"
  373. ]
  374. },
  375. "metadata": {},
  376. "output_type": "display_data"
  377. },
  378. {
  379. "data": {
  380. "text/plain": [
  381. "Confusion Matrix and Statistics\n",
  382. "\n",
  383. " Reference\n",
  384. "Prediction 0 1\n",
  385. " 0 153 22\n",
  386. " 1 10 65\n",
  387. " \n",
  388. " Accuracy : 0.872 \n",
  389. " 95% CI : (0.8241, 0.9108)\n",
  390. " No Information Rate : 0.652 \n",
  391. " P-Value [Acc > NIR] : 2.437e-15 \n",
  392. " \n",
  393. " Kappa : 0.7086 \n",
  394. " \n",
  395. " Mcnemar's Test P-Value : 0.05183 \n",
  396. " \n",
  397. " Sensitivity : 0.9387 \n",
  398. " Specificity : 0.7471 \n",
  399. " Pos Pred Value : 0.8743 \n",
  400. " Neg Pred Value : 0.8667 \n",
  401. " Prevalence : 0.6520 \n",
  402. " Detection Rate : 0.6120 \n",
  403. " Detection Prevalence : 0.7000 \n",
  404. " Balanced Accuracy : 0.8429 \n",
  405. " \n",
  406. " 'Positive' Class : 0 \n",
  407. " "
  408. ]
  409. },
  410. "metadata": {},
  411. "output_type": "display_data"
  412. }
  413. ],
  414. "source": [
  415. "model <- glm(Y ~., family = binomial(link = \"logit\"), data = test)\n",
  416. "summary(model)\n",
  417. "\n",
  418. "\n",
  419. "\n",
  420. "\n",
  421. "\n",
  422. "p <- predict(model, test[,c(1:2)])\n",
  423. "\n",
  424. "p_class <- factor(ifelse(p > 0.5, 1, 0), levels = levels(factor(test[[\"Y\"]])))\n",
  425. "\n",
  426. "confusionMatrix(p_class, factor(test[[\"Y\"]]))"
  427. ]
  428. },
  429. {
  430. "cell_type": "code",
  431. "execution_count": null,
  432. "metadata": {},
  433. "outputs": [],
  434. "source": [
  435. "\n"
  436. ]
  437. },
  438. {
  439. "cell_type": "code",
  440. "execution_count": null,
  441. "metadata": {},
  442. "outputs": [],
  443. "source": []
  444. }
  445. ],
  446. "metadata": {
  447. "kernelspec": {
  448. "display_name": "R",
  449. "language": "R",
  450. "name": "ir"
  451. },
  452. "language_info": {
  453. "codemirror_mode": "r",
  454. "file_extension": ".r",
  455. "mimetype": "text/x-r-source",
  456. "name": "R",
  457. "pygments_lexer": "r",
  458. "version": "3.4.4"
  459. }
  460. },
  461. "nbformat": 4,
  462. "nbformat_minor": 2
  463. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement