Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Predict survival for Titanic passengers whose outcome is unknown, using
# simple rules derived from exploring the labelled training data.

# NOTE(review): this working directory is a machine-specific network share.
# Any relative path used later in the script resolves against it, so it is
# kept as-is; consider replacing it with project-relative paths.
setwd("//nlamsvfls07/userdata2$/LWeston/Burberry Info/Data Fellowship")
library(readr)

# Load the training and test datasets into data frames.
train <- read.csv("~/Burberry Info/Data Fellowship/train.csv")
test <- read.csv("~/Burberry Info/Data Fellowship/test.csv")

# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable from Rscript / batch jobs.
if (interactive()) {
  View(train)
  View(test)
}
# Inspect the column types of the training data.
str(train)

# Pclass and Survived are categories, so store them as factors.
train$Pclass <- as.factor(train$Pclass)
train$Survived <- as.factor(train$Survived)
# Names are free text, so store them as character strings.
train$Name <- as.character(train$Name)

# Inspect the test data and apply the same conversions to it.
str(test)
test$Pclass <- as.factor(test$Pclass)
test$Name <- as.character(test$Name)

# Check for duplicate entries: the number of distinct names on its own only
# answers the question when compared with the row count, so also report how
# many rows repeat a name seen earlier (0 means no duplicates).
length(unique(train$Name))
sum(duplicated(train$Name))
# Survival outcome in the training data: raw counts first, then the same
# counts expressed as shares of all passengers.
table(train$Survived)
prop.table(table(train$Survived))

# Stacked bar chart of passenger counts per sex, coloured by survival.
library(ggplot2)
ggplot(data = train, mapping = aes(x = Sex, fill = factor(Survived))) +
  geom_bar(width = 0.5) +
  labs(x = "Sex", y = "Total count", fill = "Survived")
# Create the prediction column with a placeholder value.
train[["ModelPrediction"]] <- "TBD"
head(train)

# Baseline 1: predict that nobody survives.
train[["ModelPrediction"]] <- 0
head(train)
str(train$ModelPrediction)

# Number of correct predictions, then the same figure as a share of all rows.
all_perish_hits <- sum(train$ModelPrediction == train$Survived)
all_perish_hits
all_perish_hits / nrow(train)

# Baseline 2: males perish ("0"), females survive ("1"). Rows whose Sex is
# neither value keep the "TBD" placeholder; %in% treats a missing Sex as
# "no match", so such rows keep the placeholder too.
train[["ModelPrediction"]] <- ifelse(
  train$Sex %in% "male",
  "0",
  ifelse(train$Sex %in% "female", "1", "TBD")
)

# Share of rows where the sex-based prediction matches the recorded outcome.
sex_rule_hits <- sum(train$ModelPrediction == train$Survived)
sex_rule_hits / nrow(train)
# Introduce a second variable (Pclass) to the model's predictions:
# cross-tabulate passenger class against survival.
table(train$Pclass, train$Survived)

# Compare the proportions by Pclass. margin = 1 normalises each row, so every
# class sums to 1; without it the shares are of the whole table and cannot be
# compared between classes of different sizes.
prop.table(table(train$Pclass, train$Survived), margin = 1)

# Plot survival by sex, with one panel per passenger class.
ggplot(train, aes(x = Sex, fill = Survived)) +
  geom_bar(width = 0.5) +
  facet_wrap(~Pclass) +
  labs(title = "Pclass", x = "Sex", y = "Total count", fill = "Survived")
# Start again from "nobody survives".
train[["ModelPrediction"]] <- 0
head(train)

# Rule: a passenger is predicted to survive only when female and not in third
# class. %in% never yields NA, so a missing Sex counts as "not female" and a
# missing Pclass counts as "not third class".
survives <- train$Sex %in% "female" & !(train$Pclass %in% "3")
train[["ModelPrediction"]] <- as.numeric(survives)
head(train)

# Share of rows where the prediction matches the recorded outcome.
sex_pclass_hits <- sum(train$ModelPrediction == train$Survived)
sex_pclass_hits / nrow(train)
# Plot the relationship between Pclass, Sex, Age and Survival.
# Age is continuous, so bin it with a histogram (10-year bins) instead of
# drawing a 10-wide bar at every distinct age, which makes the bars overlap.
ggplot(train, aes(x = Age, fill = Survived)) +
  facet_wrap(~ Sex + Pclass) +
  geom_histogram(binwidth = 10) +
  labs(x = "Age", y = "Total count")

# Reset the predictions: everyone perishes by default.
train$ModelPrediction <- 0

# All females survive, unless they are in third class.
train$ModelPrediction[train$Sex %in% "female"] <- 1
train$ModelPrediction[train$Pclass %in% "3"] <- 0

# All males under 18 in first and second class survive. Age can be missing;
# the explicit is.na() test keeps those rows out of the mask instead of
# relying on NA indices being skipped by the assignment.
train_upper_class_boy <- train$Pclass %in% c("1", "2") &
  train$Sex %in% "male" &
  !is.na(train$Age) &
  train$Age < 18
train$ModelPrediction[train_upper_class_boy] <- 1
head(train)

# Share of rows where the prediction matches the recorded outcome.
sum(train$ModelPrediction == train$Survived) / nrow(train)
# Apply the final rule set to the test data, storing the result in a new
# Survived column. Start by assuming everyone perished.
test$Survived <- 0

# All females survive, unless they are in third class.
test$Survived[test$Sex %in% "female"] <- 1
test$Survived[test$Pclass %in% "3"] <- 0

# All males under 18 in first and second class survive. Age can be missing;
# the explicit is.na() test keeps those rows out of the mask.
test_upper_class_boy <- test$Pclass %in% c("1", "2") &
  test$Sex %in% "male" &
  !is.na(test$Age) &
  test$Age < 18
test$Survived[test_upper_class_boy] <- 1
head(test)

# View() needs a GUI, so only open the viewer in an interactive session.
if (interactive()) {
  View(test)
}

# Build the submission: one row per test passenger, holding the PassengerId
# and the Survived value predicted above.
submission <- data.frame(
  PassengerId = test$PassengerId,
  Survived = test$Survived
)
head(submission)

# Export the submission to a CSV file. row.names = FALSE stops write.csv from
# adding an unnamed row-number column, so the file contains exactly the
# PassengerId and Survived columns.
write.csv(submission, file = "titanic_in_r_submission.csv", row.names = FALSE)
Add Comment
Please, Sign In to add comment