Untitled

library(randomForest)
library(caret)
library(ggplot2)

# Load the data set. The response is the `y` column; every column except the
# last one is used as a predictor.
data <- read.csv("http://pastebin.com/raw.php?i=mE5JL1dm")

# seq_len() avoids the 1:0 pitfall of `1:(n - 1)`, and drop = FALSE keeps the
# result a data frame even when a single predictor column is selected.
data_pred <- data[, seq_len(ncol(data) - 1), drop = FALSE]
data_resp <- as.factor(data$y)

# Center and scale the predictors. Note that the transformation is estimated
# on the full data set here, before any resampling split takes place.
data_trans <- preProcess(data_pred, method = c("center", "scale"))
data_pred_scale <- predict(data_trans, data_pred)

# Leave-group-out cross-validation with 90% of the rows in each training
# split. savePredictions keeps the held-out predictions in model$pred.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
trControl <- trainControl(method = "LGOCV", p = 0.9, savePredictions = TRUE)

# Fix the RNG seed so the resampling splits and the forest are reproducible.
set.seed(123)
# `scale = FALSE` is forwarded through train()'s `...` to the underlying
# fitting routine. NOTE(review): confirm that the "rf" method actually
# consumes this argument.
model <- train(x = data_pred_scale, y = data_resp,
               method = "rf", scale = FALSE,
               trControl = trControl)

> model

Random Forest

516 samples
 11 predictors
  5 classes: '0', '0.5', '1', '1.5', '2'

No pre-processing
Resampling: Repeated Train/Test Splits Estimated (25 reps, 0.9%)

Summary of sample sizes: 468, 468, 468, 468, 468, 468, ...

Resampling results across tuning parameters:

  mtry  Accuracy  Kappa  Accuracy SD  Kappa SD
  2     0.747     0.663  0.0643       0.0853
  6     0.76      0.68   0.0507       0.068
  11    0.758     0.678  0.0574       0.0763

Accuracy was used to select the optimal model using  the largest value.
The final value used for the model was mtry = 6.

# Pair the model's predictions for the (scaled) training predictors with the
# observed classes, one row per sample.
results <- data.frame(
  pred = predict(model, data_pred_scale),
  obs = data_resp
)

# Cross-tabulate predicted vs. observed classes (confusion table on the
# training data).
table(results)

     obs
pred    0 0.5   1 1.5   2
  0   148   0   0   0   0
  0.5   0 132   0   0   0
  1     0   0 139   0   0
  1.5   0   0   0  38   0
  2     0   0   0   0  59

# Held-out predictions saved during resampling, restricted to the rows that
# were produced with mtry = 6 and to the predicted/observed class columns.
# (The closing `]` of the subset expression was missing in the original.)
model_resamples <- model$pred[model$pred$mtry == 6, c("pred", "obs")]

# Confusion table of held-out predictions vs. observations, pooled over all
# resampling iterations.
table(model_resamples)

        0 0.5   1 1.5   2
  0   296  69   5   0   0
  0.5  51 228  48   0   0
  1     3  28 255  24   9
  1.5   0   0  16  32  15
  2     0   0   1  19 101