Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#' Root-mean-square error between predictions and observed values.
#'
#' @param predicted_vector Numeric vector of predictions.
#' @param actual_vector Numeric vector of observed values, element-wise
#'   aligned with `predicted_vector`.
#' @param na.rm If `TRUE`, squared differences that are `NA` are dropped
#'   before averaging. The default `FALSE` keeps the original behaviour,
#'   where any `NA` makes the result `NA`.
#' @return A single numeric value, `sqrt(mean((predicted - actual)^2))`.
rmse <- function(predicted_vector, actual_vector, na.rm = FALSE) {
  difference <- predicted_vector - actual_vector
  sqrt(mean(difference^2, na.rm = na.rm))
}

# Baseline linear model of Gross on Year, Runtime and Budget.
# `rmse()` is defined above so that this section runs when the script is
# sourced top to bottom (it was previously called before its definition).
fit <- lm(Gross ~ Year + Runtime + Budget, data = movie)

# Predictions are made on the same data the model was fitted to, so
# `rerror` is the in-sample (training) RMSE.
predicted <- predict(fit, movie)
rerror <- rmse(predicted, movie$Gross)
# Learning curve: for each training-set percentage in `size_vector`, fit the
# linear model on a random subset of `training_set` of that size, repeat
# `n_repeats` times, and record the average train and test RMSE per size in
# `rmse_frame` (one row per percentage).
size_vector <- seq(from = 5, to = 100, by = 5)
n_sizes <- length(size_vector)
n_repeats <- 10

rmse_frame <- data.frame(
  size = numeric(n_sizes),
  train_rmse = numeric(n_sizes),
  test_rmse = numeric(n_sizes)
)

# Loop-invariant: the training set does not change between iterations.
total_rows_training <- nrow(training_set)

# The index is deliberately not called `c`, which would mask base `c()`.
for (size_index in seq_along(size_vector)) {
  size <- size_vector[size_index]
  number_of_rows_to_select <- ceiling((size / 100) * total_rows_training)

  # Preallocated: one RMSE per repetition.
  train_rmse_vector <- numeric(n_repeats)
  test_rmse_vector <- numeric(n_repeats)

  for (i in seq_len(n_repeats)) {
    # Shuffle the row indices and keep the first k. This selects the same
    # rows as shuffling the whole data frame and taking its first k rows,
    # without copying the full frame on every repetition.
    shuffled_rows <- sample(total_rows_training)
    selected_rows <- shuffled_rows[seq_len(number_of_rows_to_select)]
    sample_train_set <- training_set[selected_rows, ]

    fit <- lm(
      Gross ~ Year + Runtime + Budget + imdbVotesTransformed,
      data = sample_train_set
    )

    # Error on the subset the model was fitted to.
    train_prediction <- predict(fit, sample_train_set)
    train_rmse <- rmse(train_prediction, sample_train_set$Gross)
    train_rmse_vector[i] <- train_rmse

    # Error of the same model on the held-out test set.
    test_prediction <- predict(fit, test_set)
    test_rmse <- rmse(test_prediction, test_set$Gross)
    test_rmse_vector[i] <- test_rmse
  }

  # Average over the repetitions for this training-set size.
  rmse_frame$size[size_index] <- size
  rmse_frame$train_rmse[size_index] <- mean(train_rmse_vector)
  rmse_frame$test_rmse[size_index] <- mean(test_rmse_vector)
}
Add Comment
Please, Sign In to add comment