Guest User

Untitled

a guest
Jan 22nd, 2018
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.26 KB | None | 0 0
  library(dplyr)

  # ---- 1. Load data ------------------------------------------------------
  # "train.csv" is resolved relative to the working directory, so point the
  # session at the Titanic project folder before reading it.
  setwd("/Users/oindrilasen/WORK_AREA/Data Science/kaggle/Titanic")

  # Read the training set: the first row holds the column names, empty
  # fields become NA, and text columns stay as character vectors.
  titanic_clean <- read.csv(
    file = "train.csv",
    stringsAsFactors = FALSE,
    na.strings = "",
    header = TRUE
  )

  # Print a compact overview of the columns that were loaded.
  glimpse(titanic_clean)
  # ---- 2. Data wrangling and cleaning ------------------------------------
  # Count the distinct values in every column; columns with only a few
  # distinct values are the candidates for conversion to factors.
  vapply(titanic_clean, function(x) length(unique(x)), integer(1))

  # Turn the categorical columns into factors in a single assignment.
  to_factor <- c("Survived", "Pclass", "Sex", "Embarked")
  titanic_clean[to_factor] <- lapply(titanic_clean[to_factor], factor)

  # Count the missing values in every column.
  vapply(titanic_clean, function(x) sum(is.na(x)), integer(1))
  31.  
  # Convert Age to numeric (double). as.integer() was used here before, but
  # it truncates any fractional ages instead of just changing the type.
  titanic_clean$Age <- as.numeric(titanic_clean$Age)
  # Replace missing Age values with the median of the observed ages
  # (the same strategy that is used for Fare below).
  titanic_clean$Age[is.na(titanic_clean$Age)] <-
    median(titanic_clean$Age, na.rm = TRUE)

  # Replace missing Cabin values with the placeholder "None".
  titanic_clean$Cabin[is.na(titanic_clean$Cabin)] <- "None"

  # Inspect the distribution of Embarked.
  table(titanic_clean$Embarked)
  # Replace missing Embarked values with the most common value, i.e. "S".
  # Embarked is a factor at this point, so "S" must be an existing level.
  titanic_clean$Embarked[is.na(titanic_clean$Embarked)] <- "S"

  # Convert Fare to integer; note that this drops the fractional part.
  titanic_clean$Fare <- as.integer(titanic_clean$Fare)
  # Replace missing Fare values with the median of the observed fares.
  titanic_clean$Fare[is.na(titanic_clean$Fare)] <-
    median(titanic_clean$Fare, na.rm = TRUE)

  # Check again how many missing values remain in each column.
  vapply(titanic_clean, function(x) sum(is.na(x)), integer(1))
  48.  
  # Change the factor levels to meaningful values.
  # Helper: every level of `f` that appears in names(mapping) is renamed to
  # the corresponding mapping value; all other levels are left untouched.
  relabel_levels <- function(f, mapping) {
    current <- levels(f)
    known <- current %in% names(mapping)
    current[known] <- unname(mapping[current[known]])
    levels(f) <- current
    f
  }

  # 1. Pclass
  titanic_clean$Pclass <- relabel_levels(
    titanic_clean$Pclass,
    c("1" = "1st Class", "2" = "2nd Class", "3" = "3rd Class")
  )

  # 2. Embarked
  titanic_clean$Embarked <- relabel_levels(
    titanic_clean$Embarked,
    c("C" = "Cherbourg", "Q" = "Queenstown", "S" = "Southampton")
  )
  # ---- 3. Adding new features --------------------------------------------
  # Fare_Group: "Low" for [0, 15], "Medium" for (15, 100], "High" above 100;
  # fares outside these ranges (or missing) become NA.
  summary(titanic_clean$Fare)
  # The character round-trip keeps the factor levels in alphabetical order
  # and limited to the labels that actually occur.
  titanic_clean$Fare_Group <- factor(as.character(cut(
    titanic_clean$Fare,
    breaks = c(0, 15, 100, Inf),
    labels = c("Low", "Medium", "High"),
    right = TRUE,
    include.lowest = TRUE
  )))

  # Age_Group: "Baby" up to 3, "Kid" for (3, 12], "Teen" for (12, 18],
  # "Adult" above 18; missing ages stay NA.
  summary(titanic_clean$Age)
  titanic_clean$Age_Group <- factor(as.character(cut(
    titanic_clean$Age,
    breaks = c(-Inf, 3, 12, 18, Inf),
    labels = c("Baby", "Kid", "Teen", "Adult"),
    right = TRUE,
    include.lowest = TRUE
  )))

  # with_family: "yes" when the passenger has any parents/children (Parch)
  # or siblings/spouses (SibSp) recorded, otherwise "no".
  has_relatives <- titanic_clean$Parch != 0 | titanic_clean$SibSp != 0
  titanic_clean$with_family <- factor(ifelse(has_relatives, "yes", "no"))

  # Open the resulting data frame in the data viewer.
  View(titanic_clean)
Add Comment
Please, Sign In to add comment