SHARE
TWEET

Untitled

a guest Sep 16th, 2019 113 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. library(data.table)
  2. library(ggplot2)
  3. library(lubridate)
  4. library(zoo)
  5. library(dyn)
  6. library(reshape2)
  7. library(readxl)
  8. library(randomForest)
  9.  
  10. library(dplyr)
  11.  
  12. # import data
  13.  
  14. monthly = read_excel("I:/Data_work/PredictorData2018.xlsx",
  15.                      sheet = "Monthly")
  16. #View(PredictorData2018)
  17.  
  18. annual = read_excel("I:/Data_work/PredictorData2018.xlsx",
  19.                     sheet = "Annual")
  20. #View(annual)
  21.  
  22. # as.table data
  23. monthly <- as.data.table(monthly)
  24. annual  <- as.data.table(annual)
  25. monthly <- monthly[, Datum := ymd(Datum)]
  26. # date format 187101 == yyyymm in monthly, in annual[date_format] == yyyy
  27. #View(annual)
  28. #summary(annual)
  29.  
  30. ###################################
  31. ###################################
  32. annual <- annual[, IndexDiv := Index + D12]
  33. annual <- annual[, dp := log(D12) - log(Index)]
  34. annual <- annual[, ep := log(E12) - log(Index)]
  35. vec_dy <- c(NA, annual[2:nrow(annual), log(D12)] - annual[1:(nrow(annual)-1), log(Index)])
  36. annual <- annual[, dy := vec_dy]
  37. annual <- annual[, logret   :=c(NA,diff(log(Index)))]
  38. vec_logretdiv <- c(NA, annual[2:nrow(annual), log(IndexDiv)] - annual[1:(nrow(annual)-1), log(Index)])
  39. vec_logretdiv <- c(NA, log(annual[2:nrow(annual), IndexDiv]/annual[1:(nrow(annual)-1), Index]))
  40. annual <- annual[, logretdiv:=vec_logretdiv]
  41. annual <- annual[, logRfree := log(Rfree + 1)]
  42. annual <- annual[, rp_div   := logretdiv - logRfree]
  43. #Put it in time series (is needed in function get_statistics)
  44. ts_annual <- ts(annual, start=annual[1, Datum], end=annual[nrow(annual), Datum])
  45. plot(ts_annual[, c("rp_div", "dp", "dy")])
  46. ts_annual[-c(1),]
  47. ################################################
  48. ts = ts_annual[-c(1),]
  49. length(ts)
  50.  
  51. ts(ts)
  52. # Try the model inputed in format of a df
  53. #as.data.frame(ts)
  54. #ts
  55. #View(ts)
  56. #zoo(ts)
  57. #ts(ts)
  58. typeof(ts)
  59. typeof(ts_annual)
  60. # Check
  61. length(lag(ts_annual[, c("dp")],-1)) == length(lag(ts_annual[, c("dp")])) # TRUE 147*147
  62. array = as.array(lag(ts_annual[, c("dp")]))
  63. array
  64. dim(array)
  65. array1 = ts_annual[, c("dp")]
  66. array1
  67. dim(array1)
  68.  
  69. array_shifted = shift(array1, n=1, fill=NA, type="lag")
  70. array_shifted
  71. array_shifted[!is.na(array_shifted)]
  72.  
  73. array2 = as.array(lag(ts_annual[, c("dp")], -1))
  74. array2
  75. dim(array2)
  76. array3 = as.array(lag(ts_annual[, c("dp")], 1))
  77. array3
  78. dim(array3)
  79.  
  80. array4 = dyn$randomForest(rp_div ~ lag(ts_annual[, c("dp")], -1), data=ts_annual)
  81. array4$predicted
  82. length(dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=ts)$predicted)
  83. dim(array) == dim(array1) == dim(array2)
  84.  
  85. # Remove 1st OBS
  86. variable = lag(ts[-c(1), c("dp")], -1)
  87. variable
  88. # hardcoded yet
  89. get_statistics2 <- function(ts_df, indep, dep, h=1, start=1872, end=2018, est_periods_OOS = 20) {
  90.  
  91.   #### IS ANALYSIS
  92.  
  93.   #1. Historical mean model
  94.   avg   <- mean(window(ts_df, start, end)[, dep], na.rm=TRUE)
  95.   IS_error_N <- (window(ts_df, start, end)[, dep] - avg)
  96.  
  97.   #2. OLS model
  98.   reg <- dyn$lm(eval(parse(text=dep)) ~ lag(eval(parse(text=indep)), -1), data=window(ts_df, start, end))
  99.   IS_error_A <- reg$residuals
  100.   ###
  101.   #3. RF
  102.   rf = dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=window(ts_df, start, end))
  103.   #IS_error_RF =
  104.   ####OOS ANALYSIS
  105.   OOS_error_N <- numeric(end - start - est_periods_OOS)
  106.   OOS_error_A <- numeric(end - start - est_periods_OOS)
  107.   #Only use information that is available up to the time at which the forecast is made
  108.   j <- 0
  109.   for (i in (start + est_periods_OOS):(end-1)) {
  110.     j <- j + 1
  111.     #Get the actual ERP that you want to predict
  112.     actual_ERP <- as.numeric(window(ts_df, i+1, i+1)[, dep])
  113.    
  114.     #1. Historical mean model
  115.     OOS_error_N[j] <- actual_ERP - mean(window(ts_df, start, i)[, dep], na.rm=TRUE)
  116.    
  117.     #2. OLS model
  118.     reg_OOS <- dyn$lm(eval(parse(text=dep)) ~ lag(eval(parse(text=indep)), -1),
  119.                       data=window(ts_df, start, i))
  120.     #3. RF
  121.     rf_OOS = dyn$randomForest(rp_div ~ array_shifted, data=window(ts_df, start, i))
  122.     #Compute_error
  123.     df <- data.frame(x=as.numeric(window(ts_df, i, i)[, indep]))
  124.     names(df) <- indep
  125.     pred_ERP   <- predict.lm(reg_OOS, newdata=df)
  126.     pred_ERP_rf = predict(rf_OOS, newdata=df)
  127.     OOS_error_A[j] <-  pred_ERP - actual_ERP
  128.     OOS_error_RF[j] <-  pred_ERP_rf - actual_ERP
  129.   }
  130.  
  131.  
  132.   #Compute statistics
  133.   MSE_N <- mean(OOS_error_N^2)
  134.   MSE_A <- mean(OOS_error_A^2)
  135.   T <- length(!is.na(ts_df[, dep]))
  136.   OOS_R2  <- 1 - MSE_A/MSE_N
  137.   #
  138.   #Is the -1 enough (maybe -2 needed because of lag)?
  139.   OOS_oR2 <- OOS_R2 - (1-OOS_R2)*(reg$df.residual)/(T - 1)
  140.   dRMSE <- sqrt(MSE_N) - sqrt(MSE_A)
  141.   ##
  142.  
  143.   #### CREATE PLOT
  144.   IS  <- cumsum(IS_error_N[2:length(IS_error_N)]^2)-cumsum(IS_error_A^2)
  145.   OOS <- cumsum(OOS_error_N^2)-cumsum(OOS_error_A^2)
  146.   df  <- data.frame(x=seq.int(from=start + 1 + est_periods_OOS, to=end),
  147.                     IS=IS[(1 + est_periods_OOS):length(IS)],
  148.                     OOS=OOS) #Because you lose one observation due to the lag
  149.   #Shift IS errors vertically, so that the IS line begins
  150.   # at zero on the date of first OOS prediction. (see Goyal/Welch (2008, p. 1465))
  151.   df$IS <- df$IS - df$IS[1]
  152.   df  <- melt(df, id.var="x")
  153.   plotGG <- ggplot(df) +
  154.     geom_line(aes(x=x, y=value,color=variable)) +
  155.     geom_rect(data=data.frame(),#Needed by ggplot2, otherwise not transparent
  156.               aes(xmin=1973, xmax=1975,ymin=-0.2,ymax=0.2),
  157.               fill='red',
  158.               alpha=0.1) +
  159.     scale_y_continuous('Cumulative SSE Difference', limits=c(-0.2, 0.2)) +
  160.     scale_x_continuous('Year')
  161.   ##
  162.  
  163.   return(list(IS_error_N = IS_error_N,
  164.               IS_error_A = reg$residuals,
  165.               OOS_error_N = OOS_error_N,
  166.               OOS_error_A = OOS_error_A,
  167.               IS_R2 = summary(reg)$r.squared,
  168.               IS_aR2 = summary(reg)$adj.r.squared,
  169.               OOS_R2  = OOS_R2,
  170.               OOS_oR2 = OOS_oR2,
  171.               dRMSE = dRMSE,
  172.               plotGG = plotGG,
  173.               print.rf = print(rf),
  174.               residuals = ts[, c("rp_div")] - rf$predicted))
  175.  
  176. }
  177.  
  178. dp_stat2 <- get_statistics2(ts_annual, "dp", "rp_div", start=1872)
  179. dp_stat2$plotGG
  180. dp_stat2$print.rf
  181.  
  182. length(dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=ts_annual)) == lag(ts[, c("dp")], -1)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top