Untitled

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sean Dunn IST 707 Week 2 BLT 2.6 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Question 1: How many values are missing for each variable?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the Titanic training data, treating empty strings as missing (NA).\n",
    "# `na.strings` is spelled out in full rather than relying on R's partial\n",
    "# argument matching of the abbreviated `na.string`.\n",
    "titanic <- read.csv(\"train.csv\", na.strings = \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " [1] \"PassengerId\" \"Survived\"    \"Pclass\"      \"Name\"        \"Sex\"        \n",
      " [6] \"Age\"         \"SibSp\"       \"Parch\"       \"Ticket\"      \"Fare\"       \n",
      "[11] \"Cabin\"       \"Embarked\"   \n"
     ]
    }
   ],
   "source": [
    "# One way to determine the variable names: print them all at once\n",
    "print(names(titanic))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] \"PassengerId\"\n",
      "[1] \"Survived\"\n",
      "[1] \"Pclass\"\n",
      "[1] \"Name\"\n",
      "[1] \"Sex\"\n",
      "[1] \"Age\"\n",
      "[1] \"SibSp\"\n",
      "[1] \"Parch\"\n",
      "[1] \"Ticket\"\n",
      "[1] \"Fare\"\n",
      "[1] \"Cabin\"\n",
      "[1] \"Embarked\"\n"
     ]
    }
   ],
   "source": [
    "## Another way: loop over the variable names and print each one.\n",
    "## Iterating over names(titanic) directly avoids the 1:ncol() pitfall\n",
    "## (1:0 counts down when there are no columns) and the per-column subsetting.\n",
    "for (var_name in names(titanic)) {\n",
    "  print(var_name)\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "687"
      ],
      "text/latex": [
       "687"
      ],
      "text/markdown": [
       "687"
      ],
      "text/plain": [
       "[1] 687"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "2"
      ],
      "text/latex": [
       "2"
      ],
      "text/markdown": [
       "2"
      ],
      "text/plain": [
       "[1] 2"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Count the missing (NA) values in each variable.\n",
    "# is.na() on a data frame gives a logical matrix; colSums() adds up the TRUEs\n",
    "# per column and keeps the column names, so every count is labelled.\n",
    "colSums(is.na(titanic))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 0\n",
      "[1] 687\n",
      "[1] 2\n"
     ]
    }
   ],
   "source": [
    "## Alternate method: loop over the columns and print each missing-value count.\n",
    "## A for loop over a data frame visits its columns one at a time, so no index\n",
    "## (and no 1:ncol() edge case for an empty frame) is needed.\n",
    "for (column in titanic) {\n",
    "  print(sum(is.na(column)))\n",
    "}\n"
   ]
  },
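  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All variables except Age, Cabin, and Embarked have 0 missing values.\n",
    "\n",
    " - **Age**: 177 missing values\n",
    " - **Cabin**: 687 missing values\n",
    " - **Embarked**: 2 missing values\n",
    "\n",
    "Note: the saved cell outputs above report 0 for Age because the mean-imputation cell in Question 2 (execution count 8) had already been run when the counting cells (execution counts 14 and 15) were last executed. Restart the kernel and run all cells in order to see the Age count of 177."
   ]
  },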
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Question 2: How do you handle the missing values in each variable?\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "0"
      ],
      "text/latex": [
       "0"
      ],
      "text/markdown": [
       "0"
      ],
      "text/plain": [
       "[1] 0"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# For Age, fill each missing value with the mean of the observed ages\n",
    "\n",
    "titanic$Age <- replace(titanic$Age, is.na(titanic$Age), mean(titanic$Age, na.rm = TRUE))\n",
    "sum(is.na(titanic$Age)) # Confirm no missing values remain in variable Age "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we can see, Age now has **zero** missing values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead><tr><th scope=col>Pclass</th><th scope=col>Sex</th><th scope=col>Age</th><th scope=col>SibSp</th><th scope=col>Parch</th><th scope=col>Ticket</th><th scope=col>Fare</th><th scope=col>Embarked</th></tr></thead>\n",
       "<tbody>\n",
       "\t<tr><td>3               </td><td>male            </td><td>22              </td><td>1               </td><td>0               </td><td>A/5 21171       </td><td> 7.2500         </td><td>S               </td></tr>\n",
       "\t<tr><td>1               </td><td>female          </td><td>38              </td><td>1               </td><td>0               </td><td>PC 17599        </td><td>71.2833         </td><td>C               </td></tr>\n",
       "\t<tr><td>3               </td><td>female          </td><td>26              </td><td>0               </td><td>0               </td><td>STON/O2. 3101282</td><td> 7.9250         </td><td>S               </td></tr>\n",
       "</tbody>\n",
       "</table>\n"
      ],
      "text/latex": [
       "\\begin{tabular}{r|llllllll}\n",
       " Pclass & Sex & Age & SibSp & Parch & Ticket & Fare & Embarked\\\\\n",
       "\\hline\n",
       "\t 3                & male             & 22               & 1                & 0                & A/5 21171        &  7.2500          & S               \\\\\n",
       "\t 1                & female           & 38               & 1                & 0                & PC 17599         & 71.2833          & C               \\\\\n",
       "\t 3                & female           & 26               & 0                & 0                & STON/O2. 3101282 &  7.9250          & S               \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/markdown": [
       "\n",
       "Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | \n",
       "|---|---|---|\n",
       "| 3                | male             | 22               | 1                | 0                | A/5 21171        |  7.2500          | S                | \n",
       "| 1                | female           | 38               | 1                | 0                | PC 17599         | 71.2833          | C                | \n",
       "| 3                | female           | 26               | 0                | 0                | STON/O2. 3101282 |  7.9250          | S                | \n",
       "\n",
       "\n"
      ],
      "text/plain": [
       "  Pclass Sex    Age SibSp Parch Ticket           Fare    Embarked\n",
       "1 3      male   22  1     0     A/5 21171         7.2500 S       \n",
       "2 1      female 38  1     0     PC 17599         71.2833 C       \n",
       "3 3      female 26  0     0     STON/O2. 3101282  7.9250 S       "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Cabin is missing for most passengers, so we simply leave it out of the analysis.\n",
    "# Other variables that we may not need to examine are dropped as well.\n",
    "newVars <- c(\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Ticket\", \"Fare\", \"Embarked\")\n",
    "titanic_new <- titanic[, newVars, drop = FALSE]\n",
    "head(titanic_new, n = 3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Embarked is the port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton). With just two missing values, we can decide to ignore those missing values."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}