Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- clear; clc;
- format compact
- format longG
- %% Reading the data
- % Open the training set and stop with a clear message if it cannot be opened,
- % instead of failing later with an invalid file identifier.
- [fid, open_msg] = fopen('train.csv');
- if fid == -1
-     error('Could not open train.csv: %s', open_msg);
- end
- % fgetl drops the line terminator (fgets keeps it), and strtrim removes any
- % leftover carriage return, so the last header name carries no trailing newline.
- headers = strsplit(strtrim(fgetl(fid)), ','); % column names from the first line
- % Read all 12 columns as text; the numeric ones are converted below.
- raw_data = textscan(fid, '%s%s%s%s%s%s%s%s%s%s%s%s', 'Delimiter', ',');
- fclose(fid);
- clear ans fid open_msg
- id_num = 6:12;   % columns converted to numbers
- id_cell = 2:5;   % columns kept as text
- headers_num = headers(id_num);
- headers_cell = headers(id_cell);
- matrix_cell = raw_data(id_cell);
- %% Solving numerical variables and datetime:
- % Preallocate, then convert each selected column from text to double
- % (str2double returns NaN for entries that are not valid numbers).
- matrix_num = zeros(numel(raw_data{id_num(1)}), numel(id_num));
- for i = 1:numel(id_num)
-     matrix_num(:, i) = str2double(raw_data{id_num(i)});
- end
- % raw_data stays in the workspace: its first column is converted to
- % datetime later in the script.
- clear i headers id_cell id_num
- %Calculating the percentage of missing values in the numerical variables:
- %missval = sum(isnan(matrix_num),1)/length(matrix_num);
- %Showing the missing values per column as a horizontal bar chart:
- %x = categorical (headers_num)
- %y = sum(isnan(matrix_num),1)
- %figure;
- %graph_missval = barh(x,y)
- %The data is quite good because there are no missing values.
- %Convert the first column from a cell array of text to datetime.
- %Statements that produce one row per observation end with a semicolon so
- %that the whole dataset is not echoed to the console.
- matdata = raw_data{1};
- data_dt = datetime(matdata,'InputFormat','yyyy-MM-dd HH:mm:ss');
- %Creating dummy variables: for every text column, one 0/1 column per
- %distinct value, named "<column header>_<value>".
- dummy_heads = [];
- dummy_vars = [];
- for i = 1:numel(matrix_cell)
-     levels_i = unique(matrix_cell{i});
-     dummy_heads = [dummy_heads; strcat(headers_cell{i}, '_', levels_i)]; %#ok<AGROW>
-     dummy_vars = [dummy_vars dummyvar(nominal(matrix_cell{i}))]; %#ok<AGROW>
- end
- clear levels_i
- %Combine the numerical columns and the dummy columns into one matrix,
- %with a matching row of column names:
- var_x = [matrix_num(:,1:7) dummy_vars];
- var_x_header = [headers_num(:,1:7) dummy_heads']
- %% Descriptive Statistics:
- %Number of Rentals monthly:
- % Build 0/1 indicator matrices from the timestamps: one row per observation,
- % one column per distinct hour / day / month / year.
- n_obs = numel(data_dt);
- % Hours: column i marks hour i-1 (hours are numbered from 0).
- n_hours = length(unique(data_dt.Hour));
- matrix_hour = zeros(n_obs, n_hours);
- for i = 1:n_hours
-     matrix_hour(:, i) = double(data_dt.Hour == i-1);
- end
- % Days of the month: column j marks day j.
- n_days = length(unique(data_dt.Day));
- matrix_day = zeros(n_obs, n_days);
- for j = 1:n_days
-     matrix_day(:, j) = double(data_dt.Day == j);
- end
- % Months: column h marks month h. The range must start at 1; a range that
- % starts at a year number is empty and would leave matrix_month undefined.
- n_months = length(unique(data_dt.Month));
- matrix_month = zeros(n_obs, n_months);
- for h = 1:n_months
-     matrix_month(:, h) = double(data_dt.Month == h);
- end
- % Years: one column per distinct year found in the data, so each year keeps
- % its own column instead of overwriting the previous one.
- year_values = unique(data_dt.Year);
- matrix_Year = zeros(n_obs, numel(year_values));
- for k = 1:numel(year_values)
-     matrix_Year(:, k) = double(data_dt.Year == year_values(k));
- end
- hour_headers = ["0h","1h","2h","3h","4h","5h","6h","7h","8h","9h","10h","11h","12h","13h","14h","15h","16h","17h","18h","19h","20h","21h","22h","23h"];
- % Append the hour indicators (and their names) to the main data matrix.
- alldata_by_hour = [var_x, matrix_hour];
- alldata_by_hour_headers = [var_x_header,hour_headers]
- rentals_num = var_x(:,7);
- headers_day = ["day1","day2","day3","day4","day5","day6","day7","day8","day9","day10","day11","day12","day13","day14","day15","day16","day17","day18","day19"]
- %Hourly rentals in the seasons.
- % Rows are selected per season through columns 8-11 of var_x; within each
- % season, column i+19 of alldata_by_hour is used as the indicator for hour
- % i-1 and column 7 as the rental count.
- %spring:
- alldata_by_spring = alldata_by_hour(var_x(:,8)>0, 1:43);
- %summer:
- alldata_by_summer = alldata_by_hour(var_x(:,9)>0, 1:43);
- %autumn:
- alldata_by_autumn = alldata_by_hour(var_x(:,10)>0, 1:43);
- %winter:
- alldata_by_winter = alldata_by_hour(var_x(:,11)>0, 1:43);
- % Mean rentals per hour of day, one 1-by-24 row vector per season.
- n_hours = length(unique(data_dt.Hour));
- mean_hourly_rentals_spring = zeros(1, n_hours);
- mean_hourly_rentals_summer = zeros(1, n_hours);
- mean_hourly_rentals_autumn = zeros(1, n_hours);
- mean_hourly_rentals_winter = zeros(1, n_hours);
- for i = 1:n_hours
-     hour_col = i + 19;
-     mean_hourly_rentals_spring(i) = mean(alldata_by_spring(alldata_by_spring(:,hour_col)>0, 7));
-     mean_hourly_rentals_summer(i) = mean(alldata_by_summer(alldata_by_summer(:,hour_col)>0, 7));
-     mean_hourly_rentals_autumn(i) = mean(alldata_by_autumn(alldata_by_autumn(:,hour_col)>0, 7));
-     mean_hourly_rentals_winter(i) = mean(alldata_by_winter(alldata_by_winter(:,hour_col)>0, 7));
- end
- clear hour_col
- % One column per season, one row per hour.
- mean_hourly_rentals_byseason = [mean_hourly_rentals_spring',mean_hourly_rentals_summer',mean_hourly_rentals_autumn',mean_hourly_rentals_winter']
- hour = (0:23)';
- figure;
- % plot draws one line per column of the matrix (spring, summer, autumn, winter).
- plot(hour, mean_hourly_rentals_byseason, 'LineWidth', 1.5)
- % Set ticks and limits through the axes properties. Assigning to names such
- % as "xticks" or "xlim" only creates workspace variables and leaves the
- % axes unchanged; set(gca, ...) also works if such variables already exist.
- set(gca, 'XTick', 0:1:23, 'XLim', [0 23]);
- xlabel ('daily by hour')
- ylabel ('number of rentals')
- legend ('spring','summer','autumn','winter','location','northwest')
- %Conclusion from the graph: people tend to rent more bikes in the autumn and
- %fewer in the spring. Over 24 hours, rentals peak from the afternoon until
- %the evening and are lowest at night.
- % Correlation heatmap: pairwise correlation coefficients between the
- % columns of matrix_num, with the numeric column names on both axes.
- Corrnum_var = corrcoef(matrix_num)
- xvalue = headers_num
- yvalue = headers_num
- figure;
- heatmap(xvalue, yvalue, Corrnum_var)
- %Graphs of hourly rentals on working days vs. weekends and on holidays vs.
- %other days.
- % Rows with column 14 of alldata_by_hour equal to 0 go to count_workday,
- % rows with column 14 above 0 go to count_weekend. Column i+19 is used as
- % the indicator for hour i-1; column 7 is the total count, column 5 the
- % casual count.
- count_workday = alldata_by_hour(alldata_by_hour(:,14)<1, :);
- count_weekend = alldata_by_hour(alldata_by_hour(:,14)>0, :);
- n_hours = length(unique(data_dt.Hour));
- mean_hourly_rentals_workday = zeros(1, n_hours);
- mean_hourly_rentals_weekend = zeros(1, n_hours);
- for i = 1:n_hours
-     mean_hourly_rentals_workday(i) = mean(count_workday(count_workday(:,i+19)>0, 7));
-     mean_hourly_rentals_weekend(i) = mean(count_weekend(count_weekend(:,i+19)>0, 7));
- end
- figure;
- plot(hour,mean_hourly_rentals_workday,hour,mean_hourly_rentals_weekend)
- % Ticks and limits are set through the axes properties; assigning to
- % "xticks" / "xlim" would only create variables and not change the plot.
- set(gca, 'XTick', 0:1:23, 'XLim', [0 23]);
- xlabel ('daily by hour')
- ylabel ('number of rentals')
- legend ('Weekday','Weekend','location','northwest')
- %Graphing casual rentals on working days vs. weekends by hour:
- mean_hourly_casual_workday = zeros(1, n_hours);
- mean_hourly_casual_weekend = zeros(1, n_hours);
- for i = 1:n_hours
-     mean_hourly_casual_workday(i) = mean(count_workday(count_workday(:,i+19)>0, 5));
-     mean_hourly_casual_weekend(i) = mean(count_weekend(count_weekend(:,i+19)>0, 5));
- end
- figure;
- plot(hour,mean_hourly_casual_workday,hour,mean_hourly_casual_weekend)
- set(gca, 'XTick', 0:1:23, 'XLim', [0 23]);
- xlabel ('daily by hour')
- ylabel ('number of rentals')
- legend ('Weekday','Weekend','location','northwest')
- %Holiday vs. non-holiday rentals by hour (split on column 13):
- count_holiday = alldata_by_hour(alldata_by_hour(:,13)>0, :);
- count_not_holiday = alldata_by_hour(alldata_by_hour(:,13)<1, :);
- mean_hourly_rentals_holiday = zeros(1, n_hours);
- mean_hourly_rentals_notholiday = zeros(1, n_hours);
- for i = 1:n_hours
-     mean_hourly_rentals_holiday(i) = mean(count_holiday(count_holiday(:,i+19)>0, 7));
-     % The row mask must come from count_not_holiday itself. A mask taken
-     % from count_holiday has a different number of rows and would silently
-     % select the wrong observations.
-     mean_hourly_rentals_notholiday(i) = mean(count_not_holiday(count_not_holiday(:,i+19)>0, 7));
- end
- figure;
- plot(hour,mean_hourly_rentals_holiday,hour,mean_hourly_rentals_notholiday)
- set(gca, 'XTick', 0:1:23, 'XLim', [0 23]);
- xlabel ('daily by hour')
- ylabel ('number of rentals')
- legend ('holiday','not_holiday','location','northwest')
- %% Linear Regression and Lasso:
- %Linear regression with all numerical and dummy variables.
- % The data is split 70/30 by row order. floor() keeps the border an integer,
- % so it can serve both as the last training row and, as border+1, as the
- % first test row (a fractional border gives non-integer test indices).
- traindata_border = floor(size(var_x, 1)*0.7);
- % Predictors: columns 1-4 and 8-43 of alldata_by_hour. Columns 5-7 are left
- % out; column 7 is the response.
- predictor_cols = [1:4, 8:43];
- var_x_all = alldata_by_hour(:, predictor_cols);
- var_x_all_header = alldata_by_hour_headers(:, predictor_cols)
- var_x_all_train = alldata_by_hour(1:traindata_border, predictor_cols);
- b_train = fitlm(var_x_all_train, var_x(1:traindata_border,7), 'Intercept', true, 'PredictorVars', var_x_all_header, 'RobustOpt', 'off');
- % WAPE on the training rows: sum of absolute errors divided by the sum of
- % the actual values over the same rows.
- wape_b_train = sum(abs(var_x(1:traindata_border,7) - b_train.Fitted))/sum(var_x(1:traindata_border,7))
- %Conclusion: the result of the model is not good; the data contains more
- %categorical variables than numerical variables.
- %We propose to add interaction terms to improve the accuracy of the linear
- %model.
- %Adding interaction terms: every pair of statements below appends a block of element-wise products to matrix_num_inter and the matching column names to headers_num_inter, so the two stay column-aligned.
- %Interaction term: actual temperature * season (var_x_all column 2 times dummy_vars columns 1-4)
- matrix_num_inter = [var_x_all repmat(var_x_all(:,2),1,4).*dummy_vars(:,1:4)]; % repmat copies the single column 4 times so it can be multiplied element-wise with the 4 dummy columns
- headers_num_inter = [var_x_all_header strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_atemp')']; % strrep removes the given prefix (case-sensitive match), strcat appends the suffix
- %Interaction term: windspeed * weather (var_x_all column 4 times dummy_vars columns 9-12)
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,4),1,4).*dummy_vars(:,9:12)]; % repmat copies the single column 4 times so it can be multiplied element-wise with the 4 dummy columns
- headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_windspeed')'];
- %Interaction term: humidity * season (var_x_all column 3 times dummy_vars columns 1-4). NOTE(review): the prefix removed here is 'season_' but 'Season_' above, and 'weather_' / 'Weather_' are mixed the same way; strrep is case-sensitive, so only one spelling of each can match - confirm against the file headers.
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,3),1,4).*dummy_vars(:,1:4)]; % repmat copies the single column 4 times so it can be multiplied element-wise with the 4 dummy columns
- headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'season_',''),'_humidity')'];
- %Interaction term: working_day * weather. NOTE(review): columns 14 and 15 of var_x_all are the ones labelled weather2 / weather3 in the "hour * weather" terms below, while the "hour * workingday" terms use columns 11 and 12 - confirm that 14/15 are intended here. Changing them alters the design matrix, which presumably has to match the lasso results loaded later.
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,14),1,4).*dummy_vars(:,9:12)];
- headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'weather_',''),'_workingday0')'];
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,15),1,4).*dummy_vars(:,9:12)];
- headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'weather_',''),'_workingday1')'];
- %Interaction term: windspeed * hour (var_x_all column 4 times the 24 columns 17:40)
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,4),1,24).*var_x_all(:,17:40)]; % repmat copies the single column 24 times so it can be multiplied element-wise with columns 17:40
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_wind')']; % hour_headers ("0h".."23h") contain no 'hour_' text, so strrep leaves them unchanged; strcat appends the suffix
- %Interaction term: humidity * hour
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,3),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_humid')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: hour * atemp
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,2),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_atemp')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: hour * weather (var_x_all columns 13-16, one 24-column block per weather column)
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,13),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather1')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,14),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather2')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,15),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather3')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,16),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather4')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: humidity * atemp (disabled)
- %matrix_num_inter = [matrix_num_inter matrix_num(:,2).*matrix_num(:,3)];
- %headers_num_inter = [headers_num_inter 'Atemp_Humidity'];
- %Interaction term: season * weather (disabled)
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,13),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,14),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather2')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,15),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather3')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,16),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather4')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %Interaction term: hour * season (var_x_all columns 5-8, one 24-column block per season column)
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,5),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season1')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,6),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season2')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,7),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season3')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,8),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season4')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: hour * workingday (var_x_all columns 11 and 12):
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,11),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_workingday0')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,12),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_workingday1')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: hour * holiday (var_x_all columns 9 and 10)
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,9),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_holiday0')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,10),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_holiday1')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: workingday * weather (var_x_all columns 11 and 12 times columns 13:16)
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,11),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_Workingday0')']; % strcat concatenates strings, strrep replaces parts of strings
- matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,12),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_Workingday1')']; % strcat concatenates strings, strrep replaces parts of strings
- %Interaction term: holiday * weather (disabled)
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,9),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_notholiday')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,10),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_holiday')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %Interaction terms: casual * weekend (disabled)
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,11),1,1).*alldata_by_hour(:,5)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(7,1),'Workingday_',''),'_casual')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,12),1,1).*alldata_by_hour(:,5)]; % repmat is needed to balance inner dimensions of a vector and a matrix
- %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(8,1),'Workingday_',''),'_casual')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
- % Fit the linear model with interaction terms on the leading training rows
- % and evaluate it on the remaining rows.
- % traindata_border may be fractional (70% of the row count). Take its
- % integer part here so that the test rows start at a valid integer index;
- % 1:traindata_border already stops at that same integer.
- n_train_rows = floor(traindata_border);
- train_rows = 1:n_train_rows;
- test_rows = (n_train_rows+1):size(matrix_num_inter, 1);
- b2_train = fitlm(matrix_num_inter(train_rows,:), var_x(train_rows,7), 'Intercept', true, 'PredictorVars',headers_num_inter, 'RobustOpt', 'off');
- % In-sample WAPE: sum of absolute errors over the sum of actual values.
- wape_b2_train = sum(abs(var_x(train_rows,7) - b2_train.Fitted))/sum(var_x(train_rows,7))
- %Test model: out-of-sample WAPE on the rows after the border.
- b2_train_predicted = predict(b2_train, matrix_num_inter(test_rows,:));
- wape_b2_train_predicted = sum(abs(var_x(test_rows,7) - b2_train_predicted))/sum(var_x(test_rows,7));
- %Split data by day of the month.
- % data_by_day = [interaction-term predictors, day indicators, rental count];
- % the day indicator for day d sits in column day_offset + d. Statements that
- % produce one row per observation end with a semicolon so the matrices are
- % not echoed to the console.
- headers_day = ["day1","day2","day3","day4","day5","day6","day7","day8","day9","day10","day11","day12","day13","day14","day15","day16","day17","day18","day19"]
- data_by_day = [matrix_num_inter matrix_day var_x(:,7)];
- data_day_header = [headers_num_inter headers_day "count"]
- day_offset = length(headers_num_inter); % number of predictor columns before the day indicators
- data_day1 = data_by_day(data_by_day(:,day_offset+1)>0, :);
- data_day2 = data_by_day(data_by_day(:,day_offset+2)>0, :);
- data_day3 = data_by_day(data_by_day(:,day_offset+3)>0, :);
- data_day4 = data_by_day(data_by_day(:,day_offset+4)>0, :);
- data_day5 = data_by_day(data_by_day(:,day_offset+5)>0, :);
- data_day6 = data_by_day(data_by_day(:,day_offset+6)>0, :);
- data_day7 = data_by_day(data_by_day(:,day_offset+7)>0, :);
- data_day8 = data_by_day(data_by_day(:,day_offset+8)>0, :);
- data_day9 = data_by_day(data_by_day(:,day_offset+9)>0, :);
- data_day10 = data_by_day(data_by_day(:,day_offset+10)>0, :);
- data_day11 = data_by_day(data_by_day(:,day_offset+11)>0, :);
- data_day12 = data_by_day(data_by_day(:,day_offset+12)>0, :);
- data_day13 = data_by_day(data_by_day(:,day_offset+13)>0, :);
- data_day14 = data_by_day(data_by_day(:,day_offset+14)>0, :);
- data_day15 = data_by_day(data_by_day(:,day_offset+15)>0, :);
- data_day16 = data_by_day(data_by_day(:,day_offset+16)>0, :);
- data_day17 = data_by_day(data_by_day(:,day_offset+17)>0, :);
- data_day18 = data_by_day(data_by_day(:,day_offset+18)>0, :);
- data_day19 = data_by_day(data_by_day(:,day_offset+19)>0, :);
- % Training set: days 1-14 of every month; test set: days 15-19.
- train_data = [data_day1;data_day2;data_day3;data_day4;data_day5;data_day6;data_day7;data_day8;data_day9;data_day10;data_day11;data_day12;data_day13;data_day14];
- test_data = [data_day15;data_day16;data_day17;data_day18;data_day19];
- %Running Lasso:
- % The cross-validated lasso fit is slow, so it was run once and saved; the
- % commands that produced lasso.mat are kept here for reference.
- %[train_b2,train_fit2]=lasso(matrix_num_inter,var_x(:,7),'CV',10,'PredictorNames',headers_num_inter);
- %lassoPlot(train_b2,train_fit2,'PlotType','CV')
- %save('lasso.mat','train_b2','train_fit2')
- % Load only the two variables that the save command above wrote.
- load('lasso.mat', 'train_b2', 'train_fit2')
- % Predictors with a non-zero coefficient at the minimum-MSE lambda and at
- % the sparser one-standard-error lambda.
- selected_minMSE = train_b2(:,train_fit2.IndexMinMSE) ~= 0;
- minMSE_b2 = train_fit2.PredictorNames(selected_minMSE);
- sparse_b2 = train_fit2.PredictorNames(train_b2(:,train_fit2.Index1SE)~=0);
- %Refit a linear model on the lasso-selected interaction terms.
- % The response (rental count) is the last column of train_data / test_data,
- % so it is addressed with "end" instead of a fixed column number.
- b2_lasso = fitlm(train_data(:,selected_minMSE), train_data(:,end), 'Intercept', true, 'PredictorVars', minMSE_b2);
- wape_b2_lasso = sum(abs(train_data(:,end) - b2_lasso.Fitted))/sum(train_data(:,end));
- %Evaluating the refitted model on the test data:
- y_pred_regr_lasso = predict(b2_lasso, test_data(:,selected_minMSE));
- wape_pred_regr_lasso = sum(abs(test_data(:,end) - y_pred_regr_lasso))/sum(test_data(:,end));
- %% CART Model:
- %leafs = linspace(1,100,100);
- %rng('default')
- %N = numel(leafs);
- %err = zeros(N,1);
- %for n=1:N
- % t = fitrtree(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'CrossVal','On','MinLeaf',leafs(n));
- % err(n) = kfoldLoss(t);
- %end
- %figure;
- %plot(leafs,err,'LineWidth',1.5,'Color','r');
- %grid on
- %title('Defining of optimal minleaf')
- %xlabel('Min Leaf Size');
- %ylabel('cross-validated error');
- %dtreefull = fitrtree(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'PredictorNames',minMSE_b2,'minleaf',round(leafs(find(err==min(err))')));
- %y_pred_tree = predict(dtreefull,test_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')));
- %wape_pred_tree = sum(abs(test_data(:,448) - y_pred_tree))/sum(test_data(:,448))
- % Resubstitution error is the difference between the training responses
- % and the tree's predictions for those same training inputs.
- % For a regression tree, the resubstitution loss is the mean squared error,
- % so its square root gives the size of a typical in-sample prediction error.
- %resuberror_train = sqrt(resubLoss(dtreefull))
- %view(dtreefull,'Mode','Graph');
- %wape_tree_in = zeros(1,numel(leafs));
- %wape_tree_out = zeros(1,numel(leafs));
- %for i = 1:numel(leafs)
- % plottree = fitrtree(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'PredictorNames',minMSE_b2,'minleaf',leafs(i));
- % wape_tree_in(1,i) = sum(abs(train_data(:,448) - predict(plottree,train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')))))/sum(train_data(:,448));
- % wape_tree_out(1,i) = sum(abs(test_data(:,448) - predict(plottree,test_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')))))/sum(test_data(:,448));
- %end
- %figure;
- %plot(leafs,wape_tree_in,'LineWidth',1.75,'Color','r');
- %hold on
- %plot(leafs,wape_tree_out,'LineWidth',1.75,'Color','b');
- %grid on
- %title('Realization of WAPE in DT w.r.t. minleafsize')
- %xlabel('Min Leaf Size');
- %ylabel('WAPE, %');
- %legend('In-sample','Out-of-sample','Location','southeast')
Add Comment
Please, Sign In to add comment