Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Working copies of the engineered feature sets ----
# The test set is kept as-is.
TESTE <- features_teste

# Decision-tree inputs: drop source columns 1 and 4 in one step. Removing
# column 1 and then column 3 of the result discards exactly these two
# columns. The original comments identify them as the card id ("cartao")
# and the district.
TREINO <- features_treino[, -c(1, 4)]
VALID <- features_validacao[, -c(1, 4)]
# Random-forest inputs ----
# Built from the decision-tree frames (card id and district already dropped);
# the two categorical columns are recoded as numbers.

# Map region labels to the codes 1-5; any other label becomes 6.
# Labels are trimmed before matching so the result does not depend on how
# the raw strings are padded. NA stays NA.
encode_region <- function(x) {
  known <- c("NORTE", "SUL", "CENTRO", "MADEIRA", "AÇORES")
  code <- match(trimws(as.character(x)), known, nomatch = 6L)
  code[is.na(x)] <- NA
  as.numeric(code)
}

# Map gender labels F / M / A to 1 / 2 / 3; any other label becomes 4.
# NA stays NA.
encode_gender <- function(x) {
  known <- c("F", "M", "A")
  code <- match(trimws(as.character(x)), known, nomatch = 4L)
  code[is.na(x)] <- NA
  as.numeric(code)
}

# Training set: the label becomes a factor so that randomForest() receives a
# classification target.
# NOTE(review): columns 3 and 4 are presumably region and gender - confirm
# against the column order of TREINO.
f_TREINO <- TREINO
f_TREINO$Comprou <- as.factor(f_TREINO$Comprou)
f_TREINO[, 3] <- encode_region(f_TREINO[, 3])
f_TREINO[, 4] <- encode_gender(f_TREINO[, 4])

# Validation set: same recoding; the label is deliberately left untouched
# for now.
f_VALID <- VALID
f_VALID[, 3] <- encode_region(f_VALID[, 3])
f_VALID[, 4] <- encode_gender(f_VALID[, 4])
# --------------------------------------------------------------------------#
# MODEL FORMULA
# Predictors, grouped by the suffix of their names (3, 6, 12).
# NOTE(review): the "3" group has no qte_online3 entry, unlike the "6" and
# "12" groups - confirm this is intentional.
features <- c(
  "age", "region", "genero",
  # suffix 3
  "total_compras3", "total_AL3", "total_semana3", "media_semana3",
  "total_FDS3", "media_FDS3", "total_online3", "total_AUT3",
  "total_bazar3", "total_bio3", "total_desporto3", "total_lar3",
  "total_pets3", "total_restau3", "total_roupa3", "total_saude3",
  "total_transp3", "total_viagens3",
  # suffix 6
  "total_compras6", "total_AL6", "total_semana6", "media_semana6",
  "total_FDS6", "media_FDS6", "total_online6", "qte_online6",
  "total_AUT6", "total_bazar6", "total_bio6", "total_desporto6",
  "total_lar6", "total_pets6", "total_restau6", "total_roupa6",
  "total_saude6", "total_transp6", "total_viagens6",
  # suffix 12
  "total_compras12", "total_AL12", "total_semana12", "media_semana12",
  "total_FDS12", "media_FDS12", "total_online12", "qte_online12",
  "total_AUT12", "total_bazar12", "total_bio12", "total_desporto12",
  "total_lar12", "total_pets12", "total_restau12", "total_roupa12",
  "total_saude12", "total_transp12", "total_viagens12"
)
# Name of the response column.
label <- "Comprou"
# Comprou ~ age + region + ... (one additive term per feature).
form <- reformulate(termlabels = features, response = label)
# ---> PRECISION AND RECALL
# Compute precision and recall from a confusion-matrix object.
#
# cm: object whose `table` element is a 2 x 2 confusion table, read here as
#     predictions in rows and reference in columns, with the positive class
#     in the second row/column (the layout produced by the
#     confusionMatrix(prediction, reference) calls in this script).
# Returns c(precision, recall); an entry is NaN when its denominator is 0.
metrics <- function(cm) {
  tbl <- cm[["table"]]
  # The cell positions below only mean TP / FP / FN on a 2 x 2 table, so
  # anything else is rejected instead of yielding misleading numbers.
  stopifnot(identical(dim(tbl), c(2L, 2L)))
  tp <- tbl[2, 2]
  fp <- tbl[2, 1]
  fn <- tbl[1, 2]
  c(tp / (tp + fp), tp / (tp + fn))
}
# --------------------------------------------------------------------------#
# ---------- MODELS --------- #
# --------------------------------------------------------------------------#
# --------------------------- #
# ------ DECISION TREE ------ #
# --------------------------- #
# -------> decision tree, default settings
tree_model_0 <- rpart(form, TREINO)
rpart.plot(tree_model_0)

# ---> ROC
# The curve is built from the raw tree predictions: passing the 0/1
# thresholded values would collapse it to a single operating point. This is
# the same approach already used for tree_model_1.
preds_tree_model_0 <- predict(tree_model_0, VALID)
roc_curve_tree_0 <- roc.curve(
  scores.class0 = preds_tree_model_0,
  weights.class0 = VALID[, label],
  curve = TRUE
)
plot(roc_curve_tree_0)

# ---> confusion matrix at a 0.5 cut-off
threshold_preds <- (preds_tree_model_0 > 0.5) * 1
cm_tm_0 <- confusionMatrix(
  as.factor(threshold_preds),
  as.factor(VALID[, label])
)

# ---> precision and recall
metrics(cm_tm_0)
# --------------------------------------------------------------------------#
# --------> tree_model 1: decision tree with explicit control parameters
tree_model_1 <- rpart(
  formula = form,
  data = TREINO,
  control = rpart.control(minsplit = 30, cp = 0.015, maxdepth = 30)
)
rpart.plot(tree_model_1)

# ---> ROC, built from the raw tree predictions
preds_tree_model_1 <- predict(tree_model_1, newdata = VALID)
roc_curve_tree_1 <- roc.curve(
  scores.class0 = preds_tree_model_1,
  weights.class0 = VALID[, label],
  curve = TRUE
)
plot(roc_curve_tree_1)

# ---> confusion matrix at a 0.2 cut-off
threshold_preds_1 <- 1 * (preds_tree_model_1 > 0.2)
cm_tm_1 <- confusionMatrix(
  data = as.factor(threshold_preds_1),
  reference = as.factor(VALID[, label])
)

# ---> precision and recall
metrics(cm_tm_1)
# --------------------------------------------------------------------------#
# --------------------------------------------------------------------------#
# ------ RANDOM FOREST ------ #
# --------------------------- #
# --------> randomForest, default settings
# `type` is an argument of predict(), not of randomForest(), so it is only
# passed when predicting.
# NOTE(review): no seed is set, so results vary between runs.
rf_model_0 <- randomForest(form, f_TREINO)
print(rf_model_0)

# ---> ROC
# predict(type = "prob") returns one probability column per class; the
# second column is kept (class 1 when the label levels are 0 and 1).
preds_rf_model_0 <- predict(rf_model_0, f_VALID, type = "prob")
preds_rf_model_0 <- preds_rf_model_0[, 2]
# The curve uses the probabilities themselves: 0/1 thresholded values would
# collapse it to a single operating point.
roc_curve_rf_0 <- roc.curve(
  scores.class0 = preds_rf_model_0,
  weights.class0 = f_VALID[, label],
  curve = TRUE
)
plot(roc_curve_rf_0)

# ---> confusion matrix at a 0.5 cut-off
threshold_preds_rf_0 <- (preds_rf_model_0 > 0.5) * 1
cm_rf_0 <- confusionMatrix(
  as.factor(threshold_preds_rf_0),
  as.factor(f_VALID[, label])
)

# ---> precision and recall
metrics(cm_rf_0)
# --------------------------------------------------------------------------#
# --------> randomForest_1: 30000 trees, 4 candidate variables per split
# Figures noted by the original author: AUC=0.670 Pre=0.675 Recall=0.432.
# NOTE(review): that AUC was presumably measured on the thresholded ROC used
# before; expect a different value now that probabilities are used.
# `type` is an argument of predict(), not of randomForest(), so it is only
# passed when predicting.
rf_model_1 <- randomForest(form, f_TREINO, ntree = 30000, mtry = 4)
print(rf_model_1)

# ---> ROC
# predict(type = "prob") returns one probability column per class; the
# second column is kept (class 1 when the label levels are 0 and 1).
preds_rf_model_1 <- predict(rf_model_1, f_VALID, type = "prob")
preds_rf_model_1 <- preds_rf_model_1[, 2]
# The curve uses the probabilities themselves: 0/1 thresholded values would
# collapse it to a single operating point.
roc_curve_rf_1 <- roc.curve(
  scores.class0 = preds_rf_model_1,
  weights.class0 = f_VALID[, label],
  curve = TRUE
)
plot(roc_curve_rf_1)

# ---> confusion matrix at a 0.5 cut-off
threshold_preds_rf_1 <- (preds_rf_model_1 > 0.5) * 1
cm_rf_1 <- confusionMatrix(
  as.factor(threshold_preds_rf_1),
  as.factor(f_VALID[, label])
)

# ---> precision and recall
metrics(cm_rf_1)
# --------> rpartXse tree (this section was originally headed
# "randomForest_DMwR2", but the learner is a single tree, not a forest)
potato <- rpartXse(form, TREINO)
prp(potato, type = 4, extra = 101)

# Error of the tree predictions against the validation label: mean absolute
# error and correlation.
potato_preds <- predict(potato, VALID)
potato_mae <- mean(abs(potato_preds - VALID[, label]))
potato_mae
cor_potato <- cor(potato_preds, VALID[, label])
cor_potato

# Cross-validated error rate (3 repetitions of 10-fold CV).
# The workflow predicts with type = "class" and scores with "err", which
# needs a classification target, so the estimation runs on a copy of the
# training data whose label is a factor (a no-op if it already is one).
TREINO_cls <- TREINO
TREINO_cls[[label]] <- as.factor(TREINO_cls[[label]])
# NOTE(review): a single Workflow() receives se = c(0, 0.5, 1) as one vector;
# if three variants were intended, workflowVariants() may be what is wanted.
potato_res <- performanceEstimation(
  PredTask(form, TREINO_cls),
  Workflow(
    "standardWF",
    learner = "rpartXse",
    learner.pars = list(se = c(0, 0.5, 1)),
    predictor.pars = list(type = "class")
  ),
  EstimationTask(metrics = "err", method = CV(nReps = 3, nFolds = 10))
)
summary(potato_res)
plot(potato_res)
# --------------------------------------------------------------------------#
# --------------------------------------------------------------------------#
# ----------- SVM (C-classification) ----------- #
# ---------------------------------------------- #

# Continuous score for class "1", taken from the SVM decision values.
# predict() only returns hard class labels, which cannot trace a ROC curve.
# e1071 names the decision column "<a>/<b>", with positive values pointing
# to class <a>, so the sign is flipped when "1" is not listed first.
# NOTE(review): assumes the label values are 0 and 1 - confirm.
svm_positive_scores <- function(model, newdata) {
  pred <- predict(model, newdata, decision.values = TRUE)
  dv <- attr(pred, "decision.values")
  if (startsWith(colnames(dv)[1], "1/")) dv[, 1] else -dv[, 1]
}

# --------> SVM baseline model (radial kernel)
svm_model_R0 <- svm(
  form, TREINO,
  type = "C-classification", kernel = "radial"
)
print(svm_model_R0)

# ---> ROC from the decision values
preds_svm_model_R0 <- predict(svm_model_R0, VALID)
roc_curve_svm_R0 <- roc.curve(
  scores.class0 = svm_positive_scores(svm_model_R0, VALID),
  weights.class0 = VALID[, label],
  curve = TRUE
)
plot(roc_curve_svm_R0)

# ---> confusion matrix from the predicted classes
cm_svm_R0 <- confusionMatrix(
  as.factor(preds_svm_model_R0),
  as.factor(VALID[, label])
)

# ---> precision and recall
metrics(cm_svm_R0)

# --------------------------------------------------------------------------#
# ---> SVM 1 (radial kernel, cost = 50, gamma = 1)
svm_model_R1 <- svm(
  form, TREINO,
  type = "C-classification", kernel = "radial", cost = 50, gamma = 1
)
print(svm_model_R1)

# ---> ROC from the decision values
preds_svm_model_R1 <- predict(svm_model_R1, VALID)
roc_curve_svm_R1 <- roc.curve(
  scores.class0 = svm_positive_scores(svm_model_R1, VALID),
  weights.class0 = VALID[, label],
  curve = TRUE
)
plot(roc_curve_svm_R1)

# ---> confusion matrix from the predicted classes
cm_svm_R1 <- confusionMatrix(
  as.factor(preds_svm_model_R1),
  as.factor(VALID[, label])
)

# ---> precision and recall
metrics(cm_svm_R1)

# --------------------------------------------------------------------------#
# ------------------------------> END <-------------------------------------#
# --------------------------------------------------------------------------#
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement