Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# APS failure classification experiments.
#
# Reads the training and test CSVs, prepares several variants of the training
# data (rows with NAs omitted, kNN-imputed, mean-imputed), ranks attributes by
# gain ratio, and fits kNN, naive Bayes, J48 and random forest classifiers.

# Packages ----
library(ggplot2)       # qplot
library(caret)         # preProcess, train, confusionMatrix
library(doSNOW)        # registerDoSNOW; attaches snow for makeCluster
library(mice)          # mice, complete
library(FSelector)     # gain.ratio
library(RWeka)         # J48, evaluate_Weka_classifier
library(randomForest)  # randomForest

# Load data ----
data_dir <- "C:/Users/TiagoMonteiro/Downloads/CDados/CDados/dataset1"

# NOTE(review): the public distribution of these files is believed to carry a
# licence preamble above the header row; if these are the unmodified files,
# read.csv() needs a matching `skip =` - confirm against the files on disk.
trainData <- read.csv(file.path(data_dir, "aps_failure_training_set.csv"),
                      na.strings = "na")
testData <- read.csv(file.path(data_dir, "aps_failure_test_set.csv"),
                     na.strings = "na")

# read.csv() no longer converts strings to factors by default (R >= 4.0).
# The classifiers and confusionMatrix() below need a factor outcome whose
# levels are identical in both sets.
trainData$class <- factor(trainData$class)
testData$class <- factor(testData$class, levels = levels(trainData$class))

# Class distribution of each set.
qplot(trainData$class, xlab = "class")
qplot(testData$class, xlab = "class")

# Drop sparse columns ----
# Columns with more than 70% missing values in the training set are removed
# from BOTH sets, so train and test keep the same schema. A name-based mask is
# used instead of `-which(...)`, which would select zero columns whenever no
# column exceeds the threshold.
max_na_fraction <- 0.7
sparse_cols <- names(trainData)[colMeans(is.na(trainData)) > max_na_fraction]
trainData <- trainData[, !(names(trainData) %in% sparse_cols), drop = FALSE]
testData <- testData[, !(names(testData) %in% sparse_cols), drop = FALSE]

# kNN imputation of the test set ----
# NOTE(review): the imputation model is fitted on the test set itself and
# testData_knnimp is not used by any model below - confirm this is intended.
preProcValues <- preProcess(testData, method = "knnImpute", k = 3)
testData_knnimp <- predict(preProcValues, testData)

# NOTE(review): mice(seed = ) further down appears to reseed the RNG, so this
# seed may not be the one governing the caret resampling - confirm.
set.seed(1234)

# Parallel backend for caret ----
n_workers <- 8
cl <- makeCluster(n_workers, type = "SOCK")
registerDoSNOW(cl)

# Training set variants ----
# Variant 1: keep only complete rows.
trainData_omit <- na.omit(trainData)

# Variant 2: kNN imputation (k = 3) fitted on and applied to the training set.
train_knn_imputer <- preProcess(trainData, method = "knnImpute", k = 3)
trainData_imputation <- predict(train_knn_imputer, trainData)

# Variant 3: mean imputation through mice (single imputed data set).
trainData_mean <- mice(trainData, m = 1, maxit = 5, method = "mean", seed = 500)
testData_mean <- mice(testData, m = 1, maxit = 5, method = "mean", seed = 500)

# Names of the columns that mice logged as constant or collinear.
# Returns character(0) when mice logged no events (loggedEvents is NULL).
flagged_columns <- function(imputation) {
  events <- imputation$loggedEvents
  if (is.null(events)) {
    return(character(0))
  }
  as.character(events$out[events$meth %in% c("collinear", "constant")])
}

# Variant 4: mean imputation without the constant/collinear columns.
attrToDeleteNames <- flagged_columns(trainData_mean)
trainData_mean <- mice::complete(trainData_mean, 1)
trainData_mean_clean <- trainData_mean[
  , !(names(trainData_mean) %in% attrToDeleteNames), drop = FALSE
]

# NOTE(review): the test set drops the columns flagged in ITS OWN mice run,
# which need not match the training set's - confirm before predicting with
# models fitted on trainData_mean_clean.
attrToDeleteNames <- flagged_columns(testData_mean)
testData_mean <- mice::complete(testData_mean, 1)
testData_mean_clean <- testData_mean[
  , !(names(testData_mean) %in% attrToDeleteNames), drop = FALSE
]

# Feature selection ----
# Gain ratio of every attribute with respect to `class`; attributes whose
# score is zero or NaN are discarded. The result is a numeric vector named by
# attribute, so the surviving attributes can be identified.
gain_ratios <- gain.ratio(class ~ ., trainData_mean_clean)
gain_values <- gain_ratios[[1]]
informative <- !is.na(gain_values) & gain_values != 0
trainData_mean_clean_feature_selection <- setNames(
  gain_values[informative],
  rownames(gain_ratios)[informative]
)

# kNN ----
# Odd neighbourhood sizes from 3 to 17.
grid <- expand.grid(k = seq(3, 17, by = 2))

# The original script removed `as_000` from an object called `testData1` at
# this point; no such object is ever created, so the statement could only
# fail and has been dropped.
# NOTE(review): confirm which data set, if any, was meant.

# presumably removed because it carries no variation in the complete rows,
# which would break centring/scaling - confirm.
trainData_omit$cd_000 <- NULL

# All calls below use the first column as the outcome position: predictors
# are `data[-1]`, the outcome is `data$class` (assumes `class` is column 1).
results.model_knn_omit_pp_norm <- train(
  trainData_omit[-1], trainData_omit$class,
  method = "knn", tuneGrid = grid,
  preProcess = c("center", "scale")
)
results.model_knn_omit_pp_norm_pca <- train(
  trainData_omit[-1], trainData_omit$class,
  method = "knn", tuneGrid = grid,
  preProcess = c("center", "scale", "pca")
)
results.model_knn_clean_pp_norm <- train(
  trainData_mean_clean[-1], trainData_mean_clean$class,
  method = "knn", tuneGrid = grid,
  preProcess = c("center", "scale")
)
results.model_knn_clean_pp_norm_pca <- train(
  trainData_mean_clean[-1], trainData_mean_clean$class,
  method = "knn", tuneGrid = grid,
  preProcess = c("center", "scale", "pca")
)

# Naive Bayes ----
# Two paired candidates: (fL = 0.5, adjust = 0.5) and (fL = 1, adjust = 1),
# both with kernel density estimation.
# NOTE(review): data.frame() pairs the values row by row; use expand.grid()
# if all four combinations were intended.
grid <- data.frame(fL = c(0.5, 1.0), usekernel = TRUE, adjust = c(0.5, 1.0))

results.model_nb_omit_pp_norm <- train(
  trainData_omit[-1], trainData_omit$class,
  method = "nb", tuneGrid = grid,
  preProcess = c("center", "scale")
)
results.model_nb_omit_pp_norm_pca <- train(
  trainData_omit[-1], trainData_omit$class,
  method = "nb", tuneGrid = grid,
  preProcess = c("center", "scale", "pca")
)
results.model_nb_mean_pp_norm <- train(
  trainData_mean[-1], trainData_mean$class,
  method = "nb", tuneGrid = grid,
  preProcess = c("center", "scale")
)
results.model_nb_mean_pp_norm_pca <- train(
  trainData_mean[-1], trainData_mean$class,
  method = "nb", tuneGrid = grid,
  preProcess = c("center", "scale", "pca")
)
results.model_nb_clean_pp_norm <- train(
  trainData_mean_clean[-1], trainData_mean_clean$class,
  method = "nb", tuneGrid = grid,
  preProcess = c("center", "scale")
)
results.model_nb_clean_pp_norm_pca <- train(
  trainData_mean_clean[-1], trainData_mean_clean$class,
  method = "nb", tuneGrid = grid,
  preProcess = c("center", "scale", "pca")
)

# Models on the non-imputed data ----
# NOTE(review): trainData and testData still contain missing values at this
# point, and both train()'s formula interface and randomForest() default to
# na.action = na.fail - confirm which prepared data set these fits were meant
# to use.

# Naive Bayes with caret's default tuning grid. (`tuneList` is not a train()
# argument and has been removed.)
model_nb <- train(class ~ ., data = trainData, method = "nb",
                  preProcess = c("center", "scale"))
preds <- predict(model_nb, testData)
confusionMatrix(preds, testData$class)

# kNN on PCA components.
grid <- expand.grid(k = seq(3, 17, by = 2))
model_cv <- train(class ~ ., data = trainData, method = "knn",
                  tuneGrid = grid, preProcess = c("pca"))

# Last caret fit is done: release the worker processes.
stopCluster(cl)

# J48 ----
# The outcome is written as `class`, not `trainData$class`: with the latter,
# `.` still expands to every column of `data`, including `class` itself, so
# the target would be used as a predictor.
truck_j48 <- J48(class ~ ., data = trainData)
plot(truck_j48)
eval_j48 <- evaluate_Weka_classifier(truck_j48, numFolds = 10,
                                     complexity = FALSE, seed = 1,
                                     class = TRUE)
eval_j48
preds <- predict(truck_j48, newdata = testData)
confusionMatrix(preds, testData$class)

# Random forest ----
set.seed(1234)
# ntree and mtry are passed directly; the original wrapped them in
# `controls = cforest_unbiased(...)`, which is the party::cforest interface
# and is not an argument of randomForest().
random_forest <- randomForest(class ~ ., data = trainData,
                              ntree = 2000, mtry = 3)
preds <- predict(random_forest, newdata = testData)
confusionMatrix(preds, testData$class)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement