Advertisement
Guest User

Untitled

a guest
Sep 16th, 2019
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 6.15 KB | None | 0 0
  1. library(data.table)
  2. library(ggplot2)
  3. library(lubridate)
  4. library(zoo)
  5. library(dyn)
  6. library(reshape2)
  7. library(readxl)
  8. library(randomForest)
  9.  
  10. library(dplyr)
  11.  
  12. # import data
  13.  
  14. monthly = read_excel("I:/Data_work/PredictorData2018.xlsx",
  15.                      sheet = "Monthly")
  16. #View(PredictorData2018)
  17.  
  18. annual = read_excel("I:/Data_work/PredictorData2018.xlsx",
  19.                     sheet = "Annual")
  20. #View(annual)
  21.  
  22. # as.table data
  23. monthly <- as.data.table(monthly)
  24. annual  <- as.data.table(annual)
  25. monthly <- monthly[, Datum := ymd(Datum)]
  26. # date format 187101 == yyyymm in monthly, in annual[date_format] == yyyy
  27. #View(annual)
  28. #summary(annual)
  29.  
  30. ###################################
  31. ###################################
  32. annual <- annual[, IndexDiv := Index + D12]
  33. annual <- annual[, dp := log(D12) - log(Index)]
  34. annual <- annual[, ep := log(E12) - log(Index)]
  35. vec_dy <- c(NA, annual[2:nrow(annual), log(D12)] - annual[1:(nrow(annual)-1), log(Index)])
  36. annual <- annual[, dy := vec_dy]
  37. annual <- annual[, logret   :=c(NA,diff(log(Index)))]
  38. vec_logretdiv <- c(NA, annual[2:nrow(annual), log(IndexDiv)] - annual[1:(nrow(annual)-1), log(Index)])
  39. vec_logretdiv <- c(NA, log(annual[2:nrow(annual), IndexDiv]/annual[1:(nrow(annual)-1), Index]))
  40. annual <- annual[, logretdiv:=vec_logretdiv]
  41. annual <- annual[, logRfree := log(Rfree + 1)]
  42. annual <- annual[, rp_div   := logretdiv - logRfree]
  43. #Put it in time series (is needed in function get_statistics)
  44. ts_annual <- ts(annual, start=annual[1, Datum], end=annual[nrow(annual), Datum])
  45. plot(ts_annual[, c("rp_div", "dp", "dy")])
  46. ts_annual[-c(1),]
  47. ################################################
  48. ts = ts_annual[-c(1),]
  49. length(ts)
  50.  
  51. ts(ts)
  52. # Try the model inputed in format of a df
  53. #as.data.frame(ts)
  54. #ts
  55. #View(ts)
  56. #zoo(ts)
  57. #ts(ts)
  58. typeof(ts)
  59. typeof(ts_annual)
  60. # Check
  61. length(lag(ts_annual[, c("dp")],-1)) == length(lag(ts_annual[, c("dp")])) # TRUE 147*147
  62. array = as.array(lag(ts_annual[, c("dp")]))
  63. array
  64. dim(array)
  65. array1 = ts_annual[, c("dp")]
  66. array1
  67. dim(array1)
  68.  
  69. array_shifted = shift(array1, n=1, fill=NA, type="lag")
  70. array_shifted
  71. array_shifted[!is.na(array_shifted)]
  72.  
  73. array2 = as.array(lag(ts_annual[, c("dp")], -1))
  74. array2
  75. dim(array2)
  76. array3 = as.array(lag(ts_annual[, c("dp")], 1))
  77. array3
  78. dim(array3)
  79.  
  80. array4 = dyn$randomForest(rp_div ~ lag(ts_annual[, c("dp")], -1), data=ts_annual)
  81. array4$predicted
  82. length(dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=ts)$predicted)
  83. dim(array) == dim(array1) == dim(array2)
  84.  
  85. # Remove 1st OBS
  86. variable = lag(ts[-c(1), c("dp")], -1)
  87. variable
  88. # hardcoded yet
  89. get_statistics2 <- function(ts_df, indep, dep, h=1, start=1872, end=2018, est_periods_OOS = 20) {
  90.  
  91.   #### IS ANALYSIS
  92.  
  93.   #1. Historical mean model
  94.   avg   <- mean(window(ts_df, start, end)[, dep], na.rm=TRUE)
  95.   IS_error_N <- (window(ts_df, start, end)[, dep] - avg)
  96.  
  97.   #2. OLS model
  98.   reg <- dyn$lm(eval(parse(text=dep)) ~ lag(eval(parse(text=indep)), -1), data=window(ts_df, start, end))
  99.   IS_error_A <- reg$residuals
  100.   ###
  101.   #3. RF
  102.   rf = dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=window(ts_df, start, end))
  103.   #IS_error_RF =
  104.   ####OOS ANALYSIS
  105.   OOS_error_N <- numeric(end - start - est_periods_OOS)
  106.   OOS_error_A <- numeric(end - start - est_periods_OOS)
  107.   #Only use information that is available up to the time at which the forecast is made
  108.   j <- 0
  109.   for (i in (start + est_periods_OOS):(end-1)) {
  110.     j <- j + 1
  111.     #Get the actual ERP that you want to predict
  112.     actual_ERP <- as.numeric(window(ts_df, i+1, i+1)[, dep])
  113.    
  114.     #1. Historical mean model
  115.     OOS_error_N[j] <- actual_ERP - mean(window(ts_df, start, i)[, dep], na.rm=TRUE)
  116.    
  117.     #2. OLS model
  118.     reg_OOS <- dyn$lm(eval(parse(text=dep)) ~ lag(eval(parse(text=indep)), -1),
  119.                       data=window(ts_df, start, i))
  120.     #3. RF
  121.     rf_OOS = dyn$randomForest(rp_div ~ array_shifted, data=window(ts_df, start, i))
  122.     #Compute_error
  123.     df <- data.frame(x=as.numeric(window(ts_df, i, i)[, indep]))
  124.     names(df) <- indep
  125.     pred_ERP   <- predict.lm(reg_OOS, newdata=df)
  126.     pred_ERP_rf = predict(rf_OOS, newdata=df)
  127.     OOS_error_A[j] <-  pred_ERP - actual_ERP
  128.     OOS_error_RF[j] <-  pred_ERP_rf - actual_ERP
  129.   }
  130.  
  131.  
  132.   #Compute statistics
  133.   MSE_N <- mean(OOS_error_N^2)
  134.   MSE_A <- mean(OOS_error_A^2)
  135.   T <- length(!is.na(ts_df[, dep]))
  136.   OOS_R2  <- 1 - MSE_A/MSE_N
  137.   #
  138.   #Is the -1 enough (maybe -2 needed because of lag)?
  139.   OOS_oR2 <- OOS_R2 - (1-OOS_R2)*(reg$df.residual)/(T - 1)
  140.   dRMSE <- sqrt(MSE_N) - sqrt(MSE_A)
  141.   ##
  142.  
  143.   #### CREATE PLOT
  144.   IS  <- cumsum(IS_error_N[2:length(IS_error_N)]^2)-cumsum(IS_error_A^2)
  145.   OOS <- cumsum(OOS_error_N^2)-cumsum(OOS_error_A^2)
  146.   df  <- data.frame(x=seq.int(from=start + 1 + est_periods_OOS, to=end),
  147.                     IS=IS[(1 + est_periods_OOS):length(IS)],
  148.                     OOS=OOS) #Because you lose one observation due to the lag
  149.   #Shift IS errors vertically, so that the IS line begins
  150.   # at zero on the date of first OOS prediction. (see Goyal/Welch (2008, p. 1465))
  151.   df$IS <- df$IS - df$IS[1]
  152.   df  <- melt(df, id.var="x")
  153.   plotGG <- ggplot(df) +
  154.     geom_line(aes(x=x, y=value,color=variable)) +
  155.     geom_rect(data=data.frame(),#Needed by ggplot2, otherwise not transparent
  156.               aes(xmin=1973, xmax=1975,ymin=-0.2,ymax=0.2),
  157.               fill='red',
  158.               alpha=0.1) +
  159.     scale_y_continuous('Cumulative SSE Difference', limits=c(-0.2, 0.2)) +
  160.     scale_x_continuous('Year')
  161.   ##
  162.  
  163.   return(list(IS_error_N = IS_error_N,
  164.               IS_error_A = reg$residuals,
  165.               OOS_error_N = OOS_error_N,
  166.               OOS_error_A = OOS_error_A,
  167.               IS_R2 = summary(reg)$r.squared,
  168.               IS_aR2 = summary(reg)$adj.r.squared,
  169.               OOS_R2  = OOS_R2,
  170.               OOS_oR2 = OOS_oR2,
  171.               dRMSE = dRMSE,
  172.               plotGG = plotGG,
  173.               print.rf = print(rf),
  174.               residuals = ts[, c("rp_div")] - rf$predicted))
  175.  
  176. }
  177.  
  178. dp_stat2 <- get_statistics2(ts_annual, "dp", "rp_div", start=1872)
  179. dp_stat2$plotGG
  180. dp_stat2$print.rf
  181.  
  182. length(dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=ts_annual)) == lag(ts[, c("dp")], -1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement