Guest User

Untitled

a guest
Jul 15th, 2018
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 22.48 KB | None | 0 0
  1. clear; clc;
  2. format compact
  3. format longG
  4. %% Reading the data
  5. fid = fopen ('train.csv');
  6. headers = strsplit (fgets(fid), ','); %getting the headers
  7. raw_data = textscan(fid, '%s%s%s%s%s%s%s%s%s%s%s%s', 'Delimiter', ',');
  8. fclose(fid);
  9. clear ans fid
  10.  
  11. id_num = [6:12];
  12. id_cell = [2:5];
  13. headers_num = headers(id_num);
  14. headers_cell = headers(id_cell);
  15. matrix_cell = raw_data(id_cell);
  16. %% Solving numerical variables and datetime:
  17. for i = 1:length(id_num)
  18. matrix_num(:,i) = str2double(raw_data{id_num(i)});
  19. end
  20. clear i headers row_data id_cell id_num
  21.  
  22. %calculating the percentage of missing value in numerical variables
  23. %missval = sum(isnan(matrix_num),1)/length(matrix_num);
  24.  
  25. %showing missing value in matrices by horizontal graph
  26. %x = categorical (headers_num)
  27. %y = sum(isnan(matrix_num),1)
  28. %figure;
  29. %graph_missval = barh(x,y)
  30.  
  31. %The data is quite good because there is not missing value;
  32.  
  33. %convert the first column from cell to datetime:
  34. matdata = raw_data{1}
  35. data_dt = datetime(matdata,'InputFormat','yyyy-MM-dd HH:mm:ss');
  36. %creating dummy variables:
  37. dummy_heads=[];
  38. dummy_vars=[];
  39. for i=1:length(matrix_cell)
  40. dummy_heads = [dummy_heads; strcat(headers_cell{i},'_',unique(matrix_cell{:,i}))];
  41. dummy_vars = [dummy_vars dummyvar(nominal(matrix_cell{:,i}))];
  42. end
  43. %creating combination of matrix arrays with both numerical and categorical
  44. %variables:
  45. var_x = [matrix_num(:,1:7) dummy_vars]
  46. var_x_header = [headers_num(:,1:7) dummy_heads']
  47. %% Descriptive Statistics:
  48. %Number of Rentals monthly:
  49. %create logical matrix for data by hourly and seasonly:
  50. for i = 1:length(unique(data_dt.Hour))
  51. matrix_hour (:,i) = double(int8(data_dt.Hour == i-1))
  52. end
  53. %create logical matrix for data by date:
  54. for j = 1:length(unique(data_dt.Day))
  55. matrix_day (:,j) = double(int8(data_dt.Day == j))
  56. end
  57.  
  58. %create logical matrix for data by month:
  59. for h = 2011:length(unique(data_dt.Month))
  60. matrix_month (:,h) = double(int8(data_dt.Month == h))
  61. end
  62.  
  63. %create logical matrix for data by year:
  64. for k = 2011:[length(unique(data_dt.Year))+2010]
  65. matrix_Year = double(int8(data_dt.Year == k))
  66. end
  67.  
  68. hour_headers = ["0h","1h","2h","3h","4h","5h","6h","7h","8h","9h","10h","11h","12h","13h","14h","15h","16h","17h","18h","19h","20h","21h","22h","23h"];
  69. alldata_by_hour = [var_x, matrix_hour];
  70. alldata_by_hour_headers = [var_x_header,hour_headers]
  71. rentals_num = var_x(:,7)
  72.  
  73. headers_day = ["day1","day2","day3","day4","day5","day6","day7","day8","day9","day10","day11","day12","day13","day14","day15","day16","day17","day18","day19"]
  74.  
  75. %hourly rentals in the seasons:
  76. %spring:
  77. alldata_by_spring = alldata_by_hour(var_x(:,8)>0,1:43)
  78. %summer:
  79. alldata_by_summer = alldata_by_hour(var_x(:,9)>0,1:43)
  80. %autumn:
  81. alldata_by_autumn = alldata_by_hour(var_x(:,10)>0,1:43)
  82. %winter:
  83. alldata_by_winter = alldata_by_hour(var_x(:,11)>0,1:43)
  84.  
  85. for i = 1:length(unique(data_dt.Hour))
  86. mean_hourly_rentals_spring (:,i) = mean(alldata_by_spring(alldata_by_spring(:,i+19)>0,7))
  87. end
  88.  
  89. for i = 1:length(unique(data_dt.Hour))
  90. mean_hourly_rentals_summer (:,i) = mean(alldata_by_summer(alldata_by_summer(:,i+19)>0,7))
  91. end
  92.  
  93. for i = 1:length(unique(data_dt.Hour))
  94. mean_hourly_rentals_autumn (:,i) = mean(alldata_by_autumn(alldata_by_autumn(:,i+19)>0,7))
  95. end
  96.  
  97. for i = 1:length(unique(data_dt.Hour))
  98. mean_hourly_rentals_winter (:,i) = mean(alldata_by_winter(alldata_by_winter(:,i+19)>0,7))
  99. end
  100.  
  101. mean_hourly_rentals_byseason = [mean_hourly_rentals_spring',mean_hourly_rentals_summer',mean_hourly_rentals_autumn',mean_hourly_rentals_winter']
  102. hour = [0:23]'
  103. figure;
  104. plot(hour,mean_hourly_rentals_byseason(:,1),hour,mean_hourly_rentals_byseason(:,2),hour,mean_hourly_rentals_byseason(:,3),hour,mean_hourly_rentals_byseason(:,4),'LineWidth',1.5)
  105. xticks = [0:1:23];
  106. xlim = [0 23];
  107. xlabel ('daily by hour')
  108. ylabel ('number of rentals')
  109. legend ('spring','summer','autumn','winter','location','northwest')
  110. %Conclusion from graph: people tend to rent more bikes in the autumn, and
  111. %less in the spring. In 24 hour, the peak for renting in afternoon till
  112. %evening, the lowest level is in night.
  113.  
  114. %Heatmap:
  115. figure;
  116. Corrnum_var = corrcoef(matrix_num)
  117. xvalue = (headers_num)
  118. yvalue = (headers_num)
  119. heatmap(xvalue,yvalue,Corrnum_var)
  120.  
  121. %Graph show rentals in workinday and weekend:
  122. count_workday = alldata_by_hour(alldata_by_hour(:,14)<1,:)
  123. count_weekend = alldata_by_hour(alldata_by_hour(:,14)>0,:)
  124. for i = 1:length(unique(data_dt.Hour))
  125. mean_hourly_rentals_workday(:,i) = mean(count_workday(count_workday(:,i+19)>0,7))
  126. end
  127.  
  128. for i = 1:length(unique(data_dt.Hour))
  129. mean_hourly_rentals_weekend(:,i) = mean(count_weekend(count_weekend(:,i+19)>0,7))
  130. end
  131. figure;
  132. plot(hour,mean_hourly_rentals_workday,hour,mean_hourly_rentals_weekend)
  133. xticks = [0:1:23];
  134. xlim = [0 23];
  135. xlabel ('daily by hour')
  136. ylabel ('number of rentals')
  137. legend ('Weekday','Weekend','location','northwest')
  138.  
  139. %Graphing casual vs holiday and working daay by hour:
  140. for i = 1:length(unique(data_dt.Hour))
  141. mean_hourly_casual_workday(:,i) = mean(count_workday(count_workday(:,i+19)>0,5))
  142. end
  143.  
  144. for i = 1:length(unique(data_dt.Hour))
  145. mean_hourly_casual_weekend(:,i) = mean(count_weekend(count_weekend(:,i+19)>0,5))
  146. end
  147. figure;
  148. plot(hour,mean_hourly_casual_workday,hour,mean_hourly_casual_weekend)
  149. xticks = [0:1:23];
  150. xlim = [0 23];
  151. xlabel ('daily by hour')
  152. ylabel ('number of rentals')
  153. legend ('Weekday','Weekend','location','northwest')
  154.  
  155. count_holiday = alldata_by_hour(alldata_by_hour(:,13)>0,:)
  156. count_not_holiday = alldata_by_hour(alldata_by_hour(:,13)<1,:)
  157. for i = 1:length(unique(data_dt.Hour))
  158. mean_hourly_rentals_holiday(:,i) = mean(count_holiday(count_holiday(:,i+19)>0,7))
  159. end
  160.  
  161. for i = 1:length(unique(data_dt.Hour))
  162. mean_hourly_rentals_notholiday(:,i) = mean(count_not_holiday(count_holiday(:,i+19)>0,7))
  163. end
  164.  
  165. figure;
  166. plot(hour,mean_hourly_rentals_holiday,hour,mean_hourly_rentals_notholiday)
  167. xticks = [0:1:23];
  168. xlim = [0 23];
  169. xlabel ('daily by hour')
  170. ylabel ('number of rentals')
  171. legend ('holiday','not_holiday','location','northwest')
  172. %% Linear Regression and Lasso:
  173. %Linear regression with all numerical and dummy variables:
  174. traindata_border = length(var_x)*0.7; %We divide train data into 70-30 parts
  175. var_x_all = alldata_by_hour(:,[1:4,8:43])
  176. var_x_all_header = alldata_by_hour_headers(:,[1:4,8:43])
  177. var_x_all_train = alldata_by_hour(1:traindata_border,[1:4,8:43])
  178. b_train = fitlm(var_x_all_train, var_x(1:traindata_border,7), 'Intercept', true, 'PredictorVars',alldata_by_hour_headers(:,[1:4,8:43]), 'RobustOpt', 'off');
  179. wape_b_train = sum(abs(var_x(1:traindata_border,7) - b_train.Fitted))/sum(var_x(1:7620,7))
  180.  
  181. %Conclusion: Result of model is not good, because from data we see there
  182. %appear more categorical variables than numerical variables.
  183. %We propose to add interaction terms for improving accuracy of linear
  184. %model.
  185.  
  186. %Adding interaction terms:
  187. %Interaction term: actual temperature * season
  188. matrix_num_inter = [var_x_all repmat(var_x_all(:,2),1,4).*dummy_vars(:,1:4)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  189. headers_num_inter = [var_x_all_header strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_atemp')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  190.  
  191. %Interaction term: windspeed * weather
  192. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,4),1,4).*dummy_vars(:,9:12)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  193. headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_windspeed')'];
  194.  
  195. %interaction term: humidity * season
  196. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,3),1,4).*dummy_vars(:,1:4)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  197. headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'season_',''),'_humidity')'];
  198.  
  199. %interaction term: working_day * weather
  200. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,14),1,4).*dummy_vars(:,9:12)];
  201. headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'weather_',''),'_workingday0')'];
  202. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,15),1,4).*dummy_vars(:,9:12)];
  203. headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'weather_',''),'_workingday1')'];
  204.  
  205. %Interaction term: winspeed * hour
  206. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,4),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  207. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_wind')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  208.  
  209. %Interaction term: humidity * hour
  210. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,3),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  211. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_humid')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  212.  
  213. %Interaction term: hour * atemp
  214. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,2),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  215. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_atemp')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  216.  
  217. %Interaction term: hour * weather
  218. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,13),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  219. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  220.  
  221. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,14),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  222. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather2')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  223.  
  224. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,15),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  225. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather3')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  226.  
  227. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,16),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  228. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_weather4')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  229.  
  230. %Interaction term: humidity * atemp
  231. %matrix_num_inter = [matrix_num_inter matrix_num(:,2).*matrix_num(:,3)];
  232. %headers_num_inter = [headers_num_inter 'Atemp_Humidity'];
  233.  
  234. %Interaction term: season * weather
  235. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,13),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  236. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  237.  
  238. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,14),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  239. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather2')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  240.  
  241. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,15),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  242. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather3')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  243.  
  244. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,16),1,4).*var_x_all(:,5:8)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  245. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(1:4,1),'Season_',''),'_weather4')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  246.  
  247. %Interaction term: hour * season
  248. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,5),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  249. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  250.  
  251. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,6),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  252. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season2')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  253.  
  254. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,7),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  255. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season3')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  256.  
  257. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,8),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  258. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_season4')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  259.  
  260. %Interaction term: hour * workingday:
  261. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,11),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  262. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_workingday0')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  263.  
  264. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,12),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  265. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_workingday1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  266.  
  267. %Interaction term: hour * holiday
  268. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,9),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  269. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_holiday0')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  270.  
  271. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,10),1,24).*var_x_all(:,17:40)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  272. headers_num_inter = [headers_num_inter strcat(strrep(hour_headers','hour_',''),'_holiday1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  273.  
  274. %Interaction term: workingday * weather
  275. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,11),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  276. headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_Workingday0')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  277.  
  278. matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,12),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  279. headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_Workingday1')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  280.  
  281. %Interaction term: holiday * weather
  282. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,9),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  283. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_notholiday')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  284.  
  285. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,10),1,4).*var_x_all(:,13:16)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  286. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(9:12,1),'Weather_',''),'_holiday')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  287.  
  288. %Interaction terms: casual * weekend
  289. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,11),1,1).*alldata_by_hour(:,5)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  290. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(7,1),'Workingday_',''),'_casual')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  291.  
  292. %matrix_num_inter = [matrix_num_inter repmat(var_x_all(:,12),1,1).*alldata_by_hour(:,5)]; % repmat is needed to balance inner dimensions of a vector and a matrix
  293. %headers_num_inter = [headers_num_inter strcat(strrep(dummy_heads(8,1),'Workingday_',''),'_casual')']; % strcat is for converging of strings, strrep is for replacing of some parts of strings
  294.  
  295. b2_train = fitlm(matrix_num_inter(1:traindata_border,:), var_x(1:traindata_border,7), 'Intercept', true, 'PredictorVars',headers_num_inter, 'RobustOpt', 'off');
  296. wape_b2_train = sum(abs(var_x(1:traindata_border,7) - b2_train.Fitted))/sum(var_x(1:traindata_border,7))
  297.  
  298. %Test model:
  299. b2_train_predicted = predict(b2_train,matrix_num_inter(traindata_border+1:end,:));
  300. wape_b2_train_predicted = sum(abs(var_x(traindata_border+1:end,7) - b2_train_predicted))/sum(var_x(traindata_border+1:end,7));
  301.  
  302. %Split data by day:
  303. headers_day = ["day1","day2","day3","day4","day5","day6","day7","day8","day9","day10","day11","day12","day13","day14","day15","day16","day17","day18","day19"]
  304. data_by_day = [matrix_num_inter matrix_day var_x(:,7)]
  305. data_day_header = [headers_num_inter headers_day "count"]
  306.  
  307. data_day1 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+1)>0),1:end)
  308. data_day2 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+2)>0),1:end)
  309. data_day3 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+3)>0),1:end)
  310. data_day4 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+4)>0),1:end)
  311. data_day5 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+5)>0),1:end)
  312. data_day6 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+6)>0),1:end)
  313. data_day7 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+7)>0),1:end)
  314. data_day8 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+8)>0),1:end)
  315. data_day9 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+9)>0),1:end)
  316. data_day10 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+10)>0),1:end)
  317. data_day11 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+11)>0),1:end)
  318. data_day12 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+12)>0),1:end)
  319. data_day13 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+13)>0),1:end)
  320. data_day14 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+14)>0),1:end)
  321. data_day15 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+15)>0),1:end)
  322. data_day16 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+16)>0),1:end)
  323. data_day17 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+17)>0),1:end)
  324. data_day18 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+18)>0),1:end)
  325. data_day19 = data_by_day(logical(data_by_day(:,length(headers_num_inter)+19)>0),1:end)
  326.  
  327. train_data = [data_day1;data_day2;data_day3;data_day4;data_day5;data_day6;data_day7;data_day8;data_day9;data_day10;data_day11;data_day12;data_day13;data_day14];
  328. test_data = [data_day15;data_day16;data_day17;data_day18;data_day19];
  329.  
  330. %Running Lasso:
  331. %[train_b2,train_fit2]=lasso(matrix_num_inter,var_x(:,7),'CV',10,'PredictorNames',headers_num_inter);
  332. %lassoPlot(train_b2,train_fit2,'PlotType','CV')
  333. %save('lasso.mat','train_b2','train_fit2')
  334. load lasso
  335. minMSE_b2 = train_fit2.PredictorNames(train_b2(:,train_fit2.IndexMinMSE)~=0);
  336. sparse_b2 = train_fit2.PredictorNames(train_b2(:,train_fit2.Index1SE)~=0);
  337.  
  338. %Running lasso with interaction terms:
  339. b2_lasso = fitlm(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'Intercept',true, 'PredictorVars',minMSE_b2);
  340. wape_b2_lasso = sum(abs(train_data(:,448) - b2_lasso.Fitted))/sum(train_data(:,448));
  341. %Running lasso for test data:
  342. y_pred_regr_lasso = predict(b2_lasso,test_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')));
  343. wape_pred_regr_lasso = sum(abs(test_data(:,448) - y_pred_regr_lasso))/sum(test_data(:,448));
  344. %% CART Model:
  345. %leafs = linspace(1,100,100);
  346. %rng('default')
  347. %N = numel(leafs);
  348. %err = zeros(N,1);
  349. %for n=1:N
  350. % t = fitrtree(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'CrossVal','On','MinLeaf',leafs(n));
  351. % err(n) = kfoldLoss(t);
  352. %end
  353. %figure;
  354. %plot(leafs,err,'LineWidth',1.5,'Color','r');
  355. %grid on
  356. %title('Defining of optimal minleaf')
  357. %xlabel('Min Leaf Size');
  358. %ylabel('cross-validated error');
  359.  
  360. %dtreefull = fitrtree(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'PredictorNames',minMSE_b2,'minleaf',round(leafs(find(err==min(err))')));
  361. %y_pred_tree = predict(dtreefull,test_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')));
  362. %wape_pred_tree = sum(abs(test_data(:,448) - y_pred_tree))/sum(test_data(:,448))
  363. % Resubstitution error is the difference between the response training data
  364. % and the predictions the tree makes of the response based on the input training data.
  365. % The resubstitution loss for a regression tree is the mean-squared error.
  366. % It indicates that a typical predictive error for the tree is about the square root of it.
  367. %resuberror_train = sqrt(resubLoss(dtreefull))
  368. %view(dtreefull,'Mode','Graph');
  369.  
  370. %wape_tree_in = zeros(1,numel(leafs));
  371. %wape_tree_out = zeros(1,numel(leafs));
  372. %for i = 1:numel(leafs)
  373. % plottree = fitrtree(train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')),train_data(:,448),'PredictorNames',minMSE_b2,'minleaf',leafs(i));
  374. % wape_tree_in(1,i) = sum(abs(train_data(:,448) - predict(plottree,train_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')))))/sum(train_data(:,448));
  375. % wape_tree_out(1,i) = sum(abs(test_data(:,448) - predict(plottree,test_data(:,logical(train_b2(:,train_fit2.IndexMinMSE)')))))/sum(test_data(:,448));
  376. %end
  377.  
  378. %figure;
  379. %plot(leafs,wape_tree_in,'LineWidth',1.75,'Color','r');
  380. %hold on
  381. %plot(leafs,wape_tree_out,'LineWidth',1.75,'Color','b');
  382. %grid on
  383. %title('Realization of WAPE in DT w.r.t. minleafsize')
  384. %xlabel('Min Leaf Size');
  385. %ylabel('WAPE, %');
  386. %legend('In-sample','Out-of-sample','Location','southeast')
Add Comment
Please, Sign In to add comment