# Untitled

######## Loading the Titanic training data
# `train1` is not created in this script; it has to exist in the workspace already.
TitanicTrain <- train1

###### Counting NA entries per column of the training data
# Only NA is counted here; blank strings are not detected by is.na().
vapply(TitanicTrain, function(column) sum(is.na(column)), integer(1))

####### Loading the Titanic test data
# `test11` is likewise expected to exist in the workspace already.
TitanicTest <- test11

####### Counting NA entries per column of the test data
vapply(TitanicTest, function(column) sum(is.na(column)), integer(1))

####### Giving the test data a "Survived" column filled with NA so both sets share the same columns
TitanicTest[["Survived"]] <- NA

### Stacking training and test rows into one data set
# The combined data is stored back into `TitanicTrain`, so from here on that
# name holds BOTH the training and the test rows.
TitanicTrain <- rbind(TitanicTrain, TitanicTest)

####### Storing "Sex" as a factor
TitanicTrain[["Sex"]] <- as.factor(TitanicTrain[["Sex"]])

######## Deriving "Title" from the "Name" column
# The pattern deletes everything up to and including ", " and everything from
# the first "." onwards, leaving only the text in between.
TitanicTrain[["Title"]] <- gsub(
  pattern = '(.*, )|(\\..*)',
  replacement = '',
  x = TitanicTrain[["Name"]]
)

########### Filling the missing values of "Age" with the median age
# The median is computed over all rows of the combined data, ignoring NA.
# Age is kept numeric throughout; a round-trip through character is not needed
# to fill the gaps.
ROWS <- which(is.na(TitanicTrain$Age))
MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
TitanicTrain$Age[ROWS] <- MedianAge


########### Filling the missing values of "Embarked"
# A blank string is treated as missing as well as NA: depending on how the CSV
# was read, empty cells may arrive as "" instead of NA and would otherwise be
# left untouched (and later become their own factor level).
# NOTE(review): "S" is presumably the most frequent port -- confirm.
ROWS2 <- which(is.na(TitanicTrain$Embarked) | TitanicTrain$Embarked == "")
TitanicTrain[ROWS2, "Embarked"] <- "S"

########## Filling the missing values of "Fare"
# NOTE(review): 14.45 is a hard-coded fill value, presumably an approximate
# median fare -- confirm against the data.
Rows3 <- which(is.na(TitanicTrain$Fare))
TitanicTrain[Rows3, "Fare"] <- 14.45

########### Converting variables to the types used for modelling
# Columns that are turned into factors; assigning back by name keeps every
# column in its original position.
FactorCols <- c("Survived", "PassengerId", "Pclass", "SibSp", "Parch",
                "Embarked", "Title")
TitanicTrain[FactorCols] <- lapply(TitanicTrain[FactorCols], as.factor)

# Age is the one column forced to numeric.
TitanicTrain[["Age"]] <- as.numeric(TitanicTrain[["Age"]])

######### Dropping "Ticket" and "Cabin" (judged by the author to add no value to the model)
TitanicTrain[["Ticket"]] <- NULL
TitanicTrain[["Cabin"]] <- NULL
### "Name" is dropped too, since "Title" has already been extracted from it
TitanicTrain[["Name"]] <- NULL


######## Converting the prepared data to a plain data frame
TitanicTrain <- data.frame(TitanicTrain)

####### Restoring PassengerId as a number so the rows to be predicted can be separated
# as.numeric() applied directly to a factor returns the internal level codes,
# not the original ids; that only matches the ids when they happen to be
# 1..N without gaps. Going through as.character() recovers the real values
# (and is harmless if the column is already numeric).
TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))
# NOTE(review): assumes ids below 892 are the labelled training rows and ids
# from 892 upwards are the rows to predict -- confirm against the data.
Train <- subset(TitanicTrain, PassengerId < 892)
Test <- subset(TitanicTrain, PassengerId >= 892)

############# Separating the independent (x) and the dependent ("Survived") variables
# Survived is a factor here; its level codes are 1/2, so subtracting 1 gives
# a 0/1 numeric outcome.
y <- as.numeric(Train$Survived) - 1
# Predictors are selected by name (everything except the id and the outcome)
# rather than by column position, so x always has the same columns as xtest,
# which is built below by dropping exactly these two columns.
x <- data.frame(Train[, !(names(Train) %in% c("PassengerId", "Survived")),
                      drop = FALSE])
Test$Survived <- NULL
xtest <- data.frame(Test)
xtest$PassengerId <- NULL

######## Printing the structure of the outcome vector and both predictor frames
invisible(lapply(list(y, x, xtest), str))


###########Loading the SuperLearner Library
library(SuperLearner)

############# Fitting the ensemble with SuperLearner
# Binary outcome `y` is modelled from predictors `x` using six candidate
# learners.
# NOTE(review): no set.seed() call is visible before this fit; results may
# differ between runs -- confirm whether reproducibility is required.
single.model2 <- SuperLearner(
  Y = y,
  X = x,
  family = binomial(),
  SL.library = list(
    "SL.ranger",
    "SL.ksvm",
    "SL.gbm",
    "SL.xgboost",
    "SL.glmnet",
    "SL.randomForest"
  )
)

########### Printing the fitted model to inspect the risk estimates
print(single.model2)


########## Predicting on the test predictors
# Dispatches to the SuperLearner predict method for the fitted object.
predictions3 <- predict(single.model2, newdata = xtest)

########### Plotting the distribution of the predicted values
hist(predictions3$pred)

######### Turning the predicted values into 0/1 labels
# NOTE(review): the 0.73 cutoff is hard-coded; its rationale is not shown here.
predictions4 <- ifelse(test = predictions3$pred >= 0.73, yes = 1, no = 0)


############# Writing the 0/1 predictions to a CSV file
# NOTE(review): only the predictions (plus default row names) are written;
# PassengerId is not part of the file -- confirm this is the intended format.
write.csv(predictions4, file = "Predictions.csv")