Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(data.table)
- library(ggplot2)
- library(lubridate)
- library(zoo)
- library(dyn)
- library(reshape2)
- library(readxl)
- library(randomForest)
- library(dplyr)
- # import data
- monthly = read_excel("I:/Data_work/PredictorData2018.xlsx",
- sheet = "Monthly")
- #View(PredictorData2018)
- annual = read_excel("I:/Data_work/PredictorData2018.xlsx",
- sheet = "Annual")
- #View(annual)
- # as.table data
- monthly <- as.data.table(monthly)
- annual <- as.data.table(annual)
- monthly <- monthly[, Datum := ymd(Datum)]
- # date format 187101 == yyyymm in monthly, in annual[date_format] == yyyy
- #View(annual)
- #summary(annual)
- ###################################
- ###################################
- annual <- annual[, IndexDiv := Index + D12]
- annual <- annual[, dp := log(D12) - log(Index)]
- annual <- annual[, ep := log(E12) - log(Index)]
- vec_dy <- c(NA, annual[2:nrow(annual), log(D12)] - annual[1:(nrow(annual)-1), log(Index)])
- annual <- annual[, dy := vec_dy]
- annual <- annual[, logret :=c(NA,diff(log(Index)))]
- vec_logretdiv <- c(NA, annual[2:nrow(annual), log(IndexDiv)] - annual[1:(nrow(annual)-1), log(Index)])
- vec_logretdiv <- c(NA, log(annual[2:nrow(annual), IndexDiv]/annual[1:(nrow(annual)-1), Index]))
- annual <- annual[, logretdiv:=vec_logretdiv]
- annual <- annual[, logRfree := log(Rfree + 1)]
- annual <- annual[, rp_div := logretdiv - logRfree]
- #Put it in time series (is needed in function get_statistics)
- ts_annual <- ts(annual, start=annual[1, Datum], end=annual[nrow(annual), Datum])
- plot(ts_annual[, c("rp_div", "dp", "dy")])
- ts_annual[-c(1),]
- ################################################
- ts = ts_annual[-c(1),]
- length(ts)
- ts(ts)
- # Try the model inputed in format of a df
- #as.data.frame(ts)
- #ts
- #View(ts)
- #zoo(ts)
- #ts(ts)
- typeof(ts)
- typeof(ts_annual)
- # Check
- length(lag(ts_annual[, c("dp")],-1)) == length(lag(ts_annual[, c("dp")])) # TRUE 147*147
- array = as.array(lag(ts_annual[, c("dp")]))
- array
- dim(array)
- array1 = ts_annual[, c("dp")]
- array1
- dim(array1)
- array_shifted = shift(array1, n=1, fill=NA, type="lag")
- array_shifted
- array_shifted[!is.na(array_shifted)]
- array2 = as.array(lag(ts_annual[, c("dp")], -1))
- array2
- dim(array2)
- array3 = as.array(lag(ts_annual[, c("dp")], 1))
- array3
- dim(array3)
- array4 = dyn$randomForest(rp_div ~ lag(ts_annual[, c("dp")], -1), data=ts_annual)
- array4$predicted
- length(dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=ts)$predicted)
- dim(array) == dim(array1) == dim(array2)
- # Remove 1st OBS
- variable = lag(ts[-c(1), c("dp")], -1)
- variable
- # hardcoded yet
- get_statistics2 <- function(ts_df, indep, dep, h=1, start=1872, end=2018, est_periods_OOS = 20) {
- #### IS ANALYSIS
- #1. Historical mean model
- avg <- mean(window(ts_df, start, end)[, dep], na.rm=TRUE)
- IS_error_N <- (window(ts_df, start, end)[, dep] - avg)
- #2. OLS model
- reg <- dyn$lm(eval(parse(text=dep)) ~ lag(eval(parse(text=indep)), -1), data=window(ts_df, start, end))
- IS_error_A <- reg$residuals
- ###
- #3. RF
- rf = dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=window(ts_df, start, end))
- #IS_error_RF =
- ####OOS ANALYSIS
- OOS_error_N <- numeric(end - start - est_periods_OOS)
- OOS_error_A <- numeric(end - start - est_periods_OOS)
- #Only use information that is available up to the time at which the forecast is made
- j <- 0
- for (i in (start + est_periods_OOS):(end-1)) {
- j <- j + 1
- #Get the actual ERP that you want to predict
- actual_ERP <- as.numeric(window(ts_df, i+1, i+1)[, dep])
- #1. Historical mean model
- OOS_error_N[j] <- actual_ERP - mean(window(ts_df, start, i)[, dep], na.rm=TRUE)
- #2. OLS model
- reg_OOS <- dyn$lm(eval(parse(text=dep)) ~ lag(eval(parse(text=indep)), -1),
- data=window(ts_df, start, i))
- #3. RF
- rf_OOS = dyn$randomForest(rp_div ~ array_shifted, data=window(ts_df, start, i))
- #Compute_error
- df <- data.frame(x=as.numeric(window(ts_df, i, i)[, indep]))
- names(df) <- indep
- pred_ERP <- predict.lm(reg_OOS, newdata=df)
- pred_ERP_rf = predict(rf_OOS, newdata=df)
- OOS_error_A[j] <- pred_ERP - actual_ERP
- OOS_error_RF[j] <- pred_ERP_rf - actual_ERP
- }
- #Compute statistics
- MSE_N <- mean(OOS_error_N^2)
- MSE_A <- mean(OOS_error_A^2)
- T <- length(!is.na(ts_df[, dep]))
- OOS_R2 <- 1 - MSE_A/MSE_N
- #
- #Is the -1 enough (maybe -2 needed because of lag)?
- OOS_oR2 <- OOS_R2 - (1-OOS_R2)*(reg$df.residual)/(T - 1)
- dRMSE <- sqrt(MSE_N) - sqrt(MSE_A)
- ##
- #### CREATE PLOT
- IS <- cumsum(IS_error_N[2:length(IS_error_N)]^2)-cumsum(IS_error_A^2)
- OOS <- cumsum(OOS_error_N^2)-cumsum(OOS_error_A^2)
- df <- data.frame(x=seq.int(from=start + 1 + est_periods_OOS, to=end),
- IS=IS[(1 + est_periods_OOS):length(IS)],
- OOS=OOS) #Because you lose one observation due to the lag
- #Shift IS errors vertically, so that the IS line begins
- # at zero on the date of first OOS prediction. (see Goyal/Welch (2008, p. 1465))
- df$IS <- df$IS - df$IS[1]
- df <- melt(df, id.var="x")
- plotGG <- ggplot(df) +
- geom_line(aes(x=x, y=value,color=variable)) +
- geom_rect(data=data.frame(),#Needed by ggplot2, otherwise not transparent
- aes(xmin=1973, xmax=1975,ymin=-0.2,ymax=0.2),
- fill='red',
- alpha=0.1) +
- scale_y_continuous('Cumulative SSE Difference', limits=c(-0.2, 0.2)) +
- scale_x_continuous('Year')
- ##
- return(list(IS_error_N = IS_error_N,
- IS_error_A = reg$residuals,
- OOS_error_N = OOS_error_N,
- OOS_error_A = OOS_error_A,
- IS_R2 = summary(reg)$r.squared,
- IS_aR2 = summary(reg)$adj.r.squared,
- OOS_R2 = OOS_R2,
- OOS_oR2 = OOS_oR2,
- dRMSE = dRMSE,
- plotGG = plotGG,
- print.rf = print(rf),
- residuals = ts[, c("rp_div")] - rf$predicted))
- }
- dp_stat2 <- get_statistics2(ts_annual, "dp", "rp_div", start=1872)
- dp_stat2$plotGG
- dp_stat2$print.rf
- length(dyn$randomForest(rp_div ~ lag(ts[, c("dp")], -1), data=ts_annual)) == lag(ts[, c("dp")], -1)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement