Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Load required packages: foreign for read.dta(), boot for cv.glm().
library(foreign)
library(boot)

# Read the Stata data set and keep only the variables used in the analysis.
datafile <- read.dta("~/Downloads/peace.dta")
vars <- c(
  "wartype", "logdead", "wardur", "factnum", "factnum2", "trnsfcap",
  "develop", "exp", "decade", "treaty", "un2int", "pbs2s3"
)
df <- datafile[vars]

# Drop rows with any missing value so the modelling functions below
# all see the same complete cases.
df <- na.omit(df)
# NOTE(review): the original script called attach(df) here. It was removed
# because it is an anti-pattern and no downstream code relies on attached
# names — every model call passes an explicit `data =` argument.
# un2int: Was there a UN peace operation? 0 = no; 1 = yes.
# pbs2s3: strict peacebuilding (PB) success indicator.
##################
set.seed(12345)

# Randomly split the data into training and test sets (roughly 50/50).
# Use nrow(df) rather than the hard-coded row count 122 so the split
# stays valid if the filtered data set changes size. For n = 122 this
# produces exactly the original 61/61 split under the same seed.
n <- nrow(df)
index <- sample(seq_len(n), n, replace = FALSE)
n_train <- floor(n / 2)
train <- index[seq_len(n_train)]
test <- index[(n_train + 1):n]
train_set <- df[train, ]
test_set <- df[test, ]
# Fit a logistic regression for strict PB success on the training set,
# including a wardur x un2int interaction term.
log_reg <- glm(
  pbs2s3 ~ wartype + logdead + wardur + factnum + factnum2 + trnsfcap +
    develop + exp + decade + treaty + un2int + I(wardur * un2int),
  family = binomial, data = train_set
)
summary(log_reg)
# plot(log_reg, pch = 20)

# Predict success probabilities on the held-out test set and threshold
# at 0.5 to obtain class labels. The label count is derived from the
# test set instead of being hard-coded to 61.
predicted.prob <- predict(log_reg, test_set, type = "response")
n_test <- nrow(test_set)
predicted <- rep("Fail", n_test)
predicted[which(predicted.prob >= 0.5)] <- "Success"

# Recode the observed outcomes the same way for comparison. A new name
# is used here: the original overwrote the `test` row-index vector from
# the split with this character vector (a shadowing bug).
observed <- rep("Fail", n_test)
observed[which(test_set$pbs2s3 == 1)] <- "Success"

# Confusion matrix and overall accuracy (share of test-set outcomes
# predicted correctly).
table(predicted, observed)
mean(predicted == observed)
# K-fold cross-validation.
# The fitted model is identical on every repetition, so fit it once
# outside the loop (glm() consumes no random numbers, so hoisting it
# does not change the cv.glm() random folds relative to the original).
# cv.glm() re-partitions the data on each call, so the loop yields 10
# independent 10-fold CV error estimates.
glm.fit <- glm(
  pbs2s3 ~ wartype + logdead + wardur + factnum + factnum2 + trnsfcap +
    develop + exp + decade + treaty + un2int + I(wardur * un2int),
  family = binomial, data = df
)
cv.error.10 <- rep(0, 10)
for (i in seq_len(10)) {
  cv.error.10[i] <- cv.glm(df, glm.fit, K = 10)$delta[1]
}

# The individual cross-validation errors and their average.
print(cv.error.10)
mean(cv.error.10)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement