Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
library(dplyr)

######################################
# 1. Load Data
######################################
# Directory holding the Kaggle Titanic files. The path is handed to
# read.csv() through file.path() instead of calling setwd(), so running this
# script does not change the working directory of the R session.
data_dir <- "/Users/oindrilasen/WORK_AREA/Data Science/kaggle/Titanic"

# Read train.csv. Empty fields are read as NA, and text columns are kept as
# character (selected columns are converted to factors in section 2).
titanic_clean <- read.csv(
  file.path(data_dir, "train.csv"),
  header = TRUE,
  na.strings = "",
  stringsAsFactors = FALSE
)

# Compact overview of the column types and the first few values.
glimpse(titanic_clean)
######################################
# 2. Data Wrangling and Cleaning
######################################
# Number of distinct values per column; a small count hints at a
# categorical variable.
vapply(titanic_clean, function(x) length(unique(x)), integer(1))

# Transform the categorical variables to factors.
to_factor <- c("Survived", "Pclass", "Sex", "Embarked")
titanic_clean[to_factor] <- lapply(titanic_clean[to_factor], factor)

# Number of NA values per column before imputation.
vapply(titanic_clean, function(x) sum(is.na(x)), integer(1))

# Truncate Age to whole years (fractional ages such as 0.42 become 0).
titanic_clean$Age <- as.integer(titanic_clean$Age)
# Replace NA values for Age with the median, the same strategy used for
# Fare below.
titanic_clean$Age[is.na(titanic_clean$Age)] <-
  median(titanic_clean$Age, na.rm = TRUE)

# Replace missing Cabin numbers with "None".
titanic_clean$Cabin[is.na(titanic_clean$Cabin)] <- "None"

# Inspect the distribution of Embarked.
table(titanic_clean$Embarked)
# Replace missing Embarked values with the most common port, i.e. "S".
# Embarked is already a factor here, so the value must be an existing level.
titanic_clean$Embarked[is.na(titanic_clean$Embarked)] <- "S"

# Truncate Fare to a whole number.
titanic_clean$Fare <- as.integer(titanic_clean$Fare)
# Replace NA values for Fare with the median.
titanic_clean$Fare[is.na(titanic_clean$Fare)] <-
  median(titanic_clean$Fare, na.rm = TRUE)

# Check again for NA values after imputation.
vapply(titanic_clean, function(x) sum(is.na(x)), integer(1))

# Rename the levels of factor `f` that appear in names(new_names) to the
# corresponding values; levels not mentioned are left unchanged.
rename_levels <- function(f, new_names) {
  current <- levels(f)
  hit <- current %in% names(new_names)
  current[hit] <- unname(new_names[current[hit]])
  levels(f) <- current
  f
}

# Change the levels to meaningful values.
# 1. Pclass
titanic_clean$Pclass <- rename_levels(
  titanic_clean$Pclass,
  c("1" = "1st Class", "2" = "2nd Class", "3" = "3rd Class")
)
# 2. Embarked
titanic_clean$Embarked <- rename_levels(
  titanic_clean$Embarked,
  c("C" = "Cherbourg", "Q" = "Queenstown", "S" = "Southampton")
)
######################################
# 3. Adding New Features
######################################
# Fare_Group: [0, 15] -> "Low", (15, 100] -> "Medium", above 100 -> "High".
# Negative or missing fares fall outside every bin and become NA.
summary(titanic_clean$Fare)
fare_bins <- cut(
  titanic_clean$Fare,
  breaks = c(0, 15, 100, Inf),
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)
# Rebuild the factor from the labels so its levels are sorted alphabetically
# rather than in bin order.
titanic_clean$Fare_Group <- factor(as.character(fare_bins))

# Age_Group: up to 3 -> "Baby", (3, 12] -> "Kid", (12, 18] -> "Teen",
# above 18 -> "Adult". Missing ages become NA.
summary(titanic_clean$Age)
age_bins <- cut(
  titanic_clean$Age,
  breaks = c(-Inf, 3, 12, 18, Inf),
  labels = c("Baby", "Kid", "Teen", "Adult"),
  include.lowest = TRUE
)
titanic_clean$Age_Group <- factor(as.character(age_bins))

# with_family: "yes" when the passenger has any parent/child (Parch) or
# sibling/spouse (SibSp) recorded, otherwise "no".
has_relatives <- titanic_clean$Parch != 0 | titanic_clean$SibSp != 0
titanic_clean$with_family <- factor(ifelse(has_relatives, "yes", "no"))

# Open the resulting data frame in the data viewer.
View(titanic_clean)
Add Comment
Please, Sign In to add comment