Guest User

Untitled

a guest
Jun 20th, 2018
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.47 KB | None | 0 0
  # Titanic survival prediction ----
  # Purpose: predict and assign survival values to Titanic passengers whose
  # survival is unknown, using simple rules derived from the labelled data.

  # NOTE(review): setwd() on a hard-coded network share ties this script to
  # one machine. It is kept because relative paths used later in the script
  # are resolved against this directory.
  setwd("//nlamsvfls07/userdata2$/LWeston/Burberry Info/Data Fellowship")

  # readr is attached here, but the imports below use base read.csv().
  library(readr)

  # Import the training and test datasets.
  train <- read.csv("~/Burberry Info/Data Fellowship/train.csv")
  test <- read.csv("~/Burberry Info/Data Fellowship/test.csv")

  # View() opens a data viewer window, so only call it when a person is
  # driving the session; batch runs (Rscript) skip it.
  if (interactive()) {
    View(train)
    View(test)
  }
  14.  
  # Inspect the column types of the training data as imported.
  str(train)

  # Training data: passenger class and the survival flag become factors,
  # and passenger names become plain character strings.
  train[c("Pclass", "Survived")] <-
    lapply(train[c("Pclass", "Survived")], as.factor)
  train[["Name"]] <- as.character(train[["Name"]])

  # Inspect the column types of the test data as imported.
  str(test)

  # Test data: only Pclass and Name are converted here.
  test[["Pclass"]] <- as.factor(test[["Pclass"]])
  test[["Name"]] <- as.character(test[["Name"]])
  33.  
  # Number of distinct passenger names. A value below nrow(train) would
  # mean at least one name occurs more than once.
  sum(!duplicated(train[["Name"]]))

  # Survival counts in the training data ...
  table(train[["Survived"]])

  # ... and the same counts expressed as shares of all passengers.
  prop.table(table(train[["Survived"]]))

  # Bar chart of passenger counts per sex, filled by survival outcome.
  library(ggplot2)
  ggplot(train, aes(x = Sex, fill = factor(Survived))) +
    geom_bar(width = 0.5) +
    labs(x = "Sex", y = "Total count", fill = "Survived")
  50.  
  # Add a prediction column to the training data, starting from a text
  # placeholder.
  train[["ModelPrediction"]] <- "TBD"
  head(train)

  # Baseline model: predict that nobody survives.
  train[["ModelPrediction"]] <- 0
  head(train)

  # Confirm the type of the new column.
  str(train[["ModelPrediction"]])

  # Baseline accuracy: number of correct predictions, then the share of
  # all training passengers that number represents.
  sum(train[["ModelPrediction"]] == train[["Survived"]])
  sum(train[["ModelPrediction"]] == train[["Survived"]]) / nrow(train)

  # Sex-only model: go back to the placeholder, then predict 0 for every
  # male and 1 for every female.
  train[["ModelPrediction"]] <- "TBD"
  train[["ModelPrediction"]] <- replace(
    train[["ModelPrediction"]], train[["Sex"]] == "male", 0
  )
  train[["ModelPrediction"]] <- replace(
    train[["ModelPrediction"]], train[["Sex"]] == "female", 1
  )

  # Share of training passengers the sex-only model gets right.
  sum(train[["ModelPrediction"]] == train[["Survived"]]) / nrow(train)
  79.  
  # Introduce a second variable, passenger class (Pclass), into the model.
  # Cross-tabulate class (rows) against survival (columns).
  table(train$Pclass, train$Survived)

  # Survival proportions within each class. margin = 1 makes every row
  # (class) sum to 1; without it each cell is a share of ALL passengers,
  # so classes of different sizes cannot be compared with each other.
  prop.table(table(train$Pclass, train$Survived), margin = 1)

  # Passenger counts per sex, filled by survival, one panel per class.
  ggplot(train, aes(x = Sex, fill = Survived)) +
    geom_bar(width = 0.5) +
    facet_wrap(~Pclass) +
    labs(title = "Pclass", x = "Sex", y = "Total count", fill = "Survived")
  94.  
  # Start again from "nobody survives".
  train[["ModelPrediction"]] <- 0
  head(train)

  # Sex + class model: first mark every female as a survivor, then put
  # every third-class passenger back to 0. Only females outside third
  # class remain predicted to survive.
  train[["ModelPrediction"]] <- replace(
    train[["ModelPrediction"]], train[["Sex"]] == "female", 1
  )
  train[["ModelPrediction"]] <- replace(
    train[["ModelPrediction"]], train[["Pclass"]] == "3", 0
  )
  head(train)

  # Share of training passengers whose prediction matches the outcome.
  sum(train[["ModelPrediction"]] == train[["Survived"]]) / nrow(train)
  106.  
  # Relationship between Pclass, Sex, Age and survival: age distribution
  # filled by survival, one panel per sex/class combination.
  # Age is continuous, so it is binned into 10-year bins with
  # geom_histogram(). geom_bar(width = 10) instead draws one 10-unit-wide
  # bar per distinct age value, and neighbouring bars overlap.
  # Rows with a missing Age, if any, are dropped by ggplot2 with a warning.
  ggplot(train, aes(x = Age, fill = Survived)) +
    facet_wrap(~Sex + Pclass) +
    geom_histogram(binwidth = 10) +
    labs(x = "Age", y = "Total count")
  113.  
  # Start again from "nobody survives".
  train$ModelPrediction <- 0
  head(train)

  # Rule 1: all females survive, unless they are in third class.
  train$ModelPrediction[train$Sex == "female"] <- 1
  train$ModelPrediction[train$Pclass == "3"] <- 0
  head(train)

  # Rule 2: males under 18 in first or second class survive.
  # The mask is built to be NA-free: %in% never returns NA, and a missing
  # Age is excluded explicitly instead of relying on how NA subscripts
  # happen to behave in a scalar assignment.
  is_young_upper_class_male <- train$Pclass %in% c("1", "2") &
    train$Sex %in% "male" &
    !is.na(train$Age) &
    train$Age < 18
  train$ModelPrediction[is_young_upper_class_male] <- 1
  head(train)

  # Share of training passengers whose prediction matches the outcome.
  sum(train$ModelPrediction == train$Survived) / nrow(train)
  129.  
  # Create the "submission" data frame holding the test passenger ids.
  # The column is named at construction, so no rename step is needed.
  submission <- data.frame(PassengerId = test$PassengerId)
  head(submission)

  # Apply the final rule set to the test data.
  # Start by assuming everyone perished.
  test$Survived <- 0
  head(test)

  # All females survive ...
  test$Survived[test$Sex == "female"] <- 1
  head(test)

  # ... except third-class passengers, who are all predicted to perish.
  test$Survived[test$Pclass == "3"] <- 0

  # Males under 18 in first or second class survive. The mask is NA-free:
  # %in% never returns NA and a missing Age is excluded explicitly.
  is_young_upper_class_male <- test$Pclass %in% c("1", "2") &
    test$Sex %in% "male" &
    !is.na(test$Age) &
    test$Age < 18
  test$Survived[is_young_upper_class_male] <- 1

  # View() opens a data viewer window; skip it in batch runs.
  if (interactive()) {
    View(test)
  }

  # Copy the predictions from test$Survived into the submission frame.
  submission$Survived <- test$Survived
  head(submission)

  # Export the submission to a csv file. row.names = FALSE keeps the file
  # to the two columns PassengerId and Survived; the default would prepend
  # an unnamed column of row numbers.
  write.csv(
    submission,
    file = "titanic_in_r_submission.csv",
    row.names = FALSE
  )
Add Comment
Please, Sign In to add comment