Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Load the Titanic training data ----
# `train1` is not created by this script; it must already exist in the
# workspace before this point.
TitanicTrain <- train1

# Number of missing values in each training column.
# vapply() guarantees exactly one integer per column, whereas sapply() can
# silently change its return type depending on the input.
vapply(TitanicTrain, function(column) sum(is.na(column)), integer(1))

# Load the Titanic test data ----
# `test11` must likewise already exist in the workspace.
TitanicTest <- test11

# Number of missing values in each test column.
vapply(TitanicTest, function(column) sum(is.na(column)), integer(1))

# Give the test set an all-NA Survived column so that it has the same set of
# columns as the training set and the two can be stacked.
TitanicTest$Survived <- NA

# Stack training and test rows into one table so that all later cleaning is
# applied to both at once. From here on `TitanicTrain` holds BOTH sets.
TitanicTrain <- rbind(TitanicTrain, TitanicTest)
# Encode Sex as a factor ----
TitanicTrain$Sex <- as.factor(TitanicTrain$Sex)

# Extract the title from Name ----
# The pattern deletes everything up to and including ", " and everything from
# the first "." onwards, leaving only the text in between.
TitanicTrain$Title <- gsub('(.*, )|(\\..*)', '', TitanicTrain$Name)

# Fill missing Age with the median age ----
# The column is kept numeric throughout: assigning directly into the vector
# avoids a detour through character and works the same for a data.frame or a
# tibble.
ROWS <- which(is.na(TitanicTrain$Age))
MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
TitanicTrain$Age[ROWS] <- MedianAge

# Fill missing Embarked with "S" ----
# A blank string is treated as missing as well as NA, because how blanks were
# parsed depends on how the data was imported. Work on a character copy so
# that no empty factor level is left behind.
embarked_values <- as.character(TitanicTrain$Embarked)
ROWS2 <- which(is.na(embarked_values) | embarked_values == "")
embarked_values[ROWS2] <- "S"
TitanicTrain$Embarked <- embarked_values

# Fill missing Fare with a fixed value ----
# NOTE(review): 14.45 looks like a rounded median fare; confirm, and consider
# computing it from the data the way MedianAge is computed.
fare_fill_value <- 14.45
Rows3 <- which(is.na(TitanicTrain$Fare))
TitanicTrain$Fare[Rows3] <- fare_fill_value
# Convert columns to their modelling types ----
# These columns are treated as categorical by the model.
factor_columns <- c("Survived", "Pclass", "SibSp", "Parch", "Embarked", "Title")
TitanicTrain[factor_columns] <- lapply(TitanicTrain[factor_columns], as.factor)

# Age must be numeric for the model (it may arrive here as character).
TitanicTrain$Age <- as.numeric(TitanicTrain$Age)

# PassengerId is only used to split the rows below, so it stays numeric.
# Going through as.character() makes this safe even if the column is a factor:
# as.numeric() applied directly to a factor returns internal level codes, not
# the original IDs.
TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))

# Make sure we are working with a plain data frame ----
TitanicTrain <- as.data.frame(TitanicTrain)

# Drop columns that are not used as predictors ----
# Ticket and Cabin are removed as uninformative / largely missing (per the
# original author); Name is removed because Title has already been extracted.
dropped_columns <- c("Ticket", "Cabin", "Name")
kept_columns <- setdiff(names(TitanicTrain), dropped_columns)
TitanicTrain <- TitanicTrain[, kept_columns, drop = FALSE]

# Split the combined table back into labelled and unlabelled rows ----
# Rows with PassengerId below this value are the labelled training rows; the
# rest are the rows to predict.
first_test_id <- 892
is_training_row <- TitanicTrain$PassengerId < first_test_id
Train <- TitanicTrain[which(is_training_row), , drop = FALSE]
Test <- TitanicTrain[which(!is_training_row), , drop = FALSE]
# Separate the outcome (y) from the predictors (x) ----
# Survived is converted via as.character() so that the factor labels "0"/"1"
# become the numbers 0/1 directly, rather than relying on level codes minus 1.
y <- as.numeric(as.character(Train$Survived))

# Every column except the identifier and the outcome is a predictor. Selecting
# by name (not position) keeps this correct if the column order ever changes,
# and reusing the same list guarantees x and xtest have identical columns.
predictor_columns <- setdiff(names(Train), c("PassengerId", "Survived"))
x <- data.frame(Train[, predictor_columns, drop = FALSE])

# The test rows have no known outcome; drop the placeholder column.
Test$Survived <- NULL
xtest <- data.frame(Test[, predictor_columns, drop = FALSE])

# Inspect the structure of everything that will be passed to the model ----
str(y)
str(x)
str(xtest)
# Load the SuperLearner package (ensemble modelling) ----
library(SuperLearner)

# Fix the random seed so that the fit below, and the risk estimates printed
# from it, are reproducible from run to run.
set.seed(1)

# Train the ensemble ----
# Binary outcome `y` is modelled from predictors `x` using the six listed
# learners. Each wrapper needs its underlying package installed.
single.model2 <- SuperLearner(
  Y = y,
  X = x,
  family = binomial(),
  SL.library = list(
    "SL.ranger", "SL.ksvm", "SL.gbm", "SL.xgboost",
    "SL.glmnet", "SL.randomForest"
  )
)

# Print the fitted model to check the risk estimates and learner weights ----
print(single.model2)
# Predict on the test rows ----
# Use the predict() generic rather than calling the S3 method by its full name.
predictions3 <- predict(single.model2, newdata = xtest)

# Look at the distribution of the ensemble predictions ----
hist(predictions3$pred)

# Convert predictions to 0/1 labels ----
# NOTE(review): 0.73 is a high cutoff compared with the usual 0.5; confirm it
# was chosen deliberately.
survival_cutoff <- 0.73
predictions4 <- ifelse(predictions3$pred >= survival_cutoff, 1, 0)

# Write the predictions to a CSV file ----
# Each prediction is written next to its PassengerId so that rows can be
# matched back to passengers; R's automatic row names are omitted.
submission <- data.frame(
  PassengerId = Test$PassengerId,
  Survived = as.vector(predictions4)
)
write.csv(submission, "Predictions.csv", row.names = FALSE)
Add Comment
Please, Sign In to add comment