Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(tseries)
- library(forecast)
- library(cluster)
- ### In case it is your first time in R, install them as: ###
- #install.packages('tseries')
- #install.packages('forecast')
- #install.packages('cluster')
- path <- '/Users/heidaoskpetursdottir/Documents/DecisionMaking/'
- PassengersData <- na.omit(read.csv(paste0(path,'passengers.csv'),sep=','))$Passengers.thousands.
- DataSet <- PassengersData
- # Use the last year as test set (validation set) but remember that you need to redict for the following year, so you wont have the real data to compare.
- smp_size <- length(DataSet)-12
- train_ind <- c(1:smp_size)
- train <- DataSet[train_ind]
- test <- DataSet[-train_ind]
- plot(train,col='blue',pch=20,main = '',xlab = 'Time [months]',ylab='# thousands of passengers',xlim=c(1,length(train)),ylim = c(min(DataSet),max(DataSet)))
- lines(train,col='blue',lwd=1)
- #Data for number of passengers during the past 11 years
- plot(train,col='blue',pch=20,main = '',xlab = 'Time [months]',ylab='# thousands of passengers',xlim=c(1,length(DataSet)),ylim = c(min(DataSet),max(DataSet)))
- lines(train,col='blue',lwd=1)
- points(c((length(train)+1):(length(DataSet))),test,pch=20,col='black')
- lines(c((length(train)+1):(length(DataSet))),test,lwd=1,col='black')
- abline(v=length(train),col='grey')
- legend('topleft',legend=c('Training Set','Test Set'),col=c('blue','black'),lwd=c(2,2))
- ##############################
- ### TIME SERIES ANLYSIS ######
- ##############################
- # Plot ACF and PACF to check stationarity and correlations:
- acf(train,main='ACF for Data',length(train)/2)
- pacf(train,main='PACF for Data',length(train)/2)
- # Apply transformations if required:
- #log(train)
- #diff(train,1)
- #diff(log(train),1)
- #diff(log(train),S)
- ###########################################################################################
- # choose model Parameters
- # Redifine train set
- train2 <- diff(diff(log(train),12),1) # differentiate once for the mean and once for the seasonality.
- # log to remove the variance. We want all our data to be random noice. We are removing all structure we have in our data to get the random walk to see the correlation between the lacks.
- acf(train2,main='ACF for Data after differentiation',length(train)/2)
- # look at the signficant lag
- pacf(train2,main='PACF for Data after differentiation',length(train)/2)
- plot(train2,col="blue",pch=20,main = "",xlab = "Time [months]",ylab="# thousands of passengers",xlim=c(1,length(DataSet)),ylim = c(min(diff(DataSet)),max(diff(DataSet))))
- lines(train2,col="blue",lwd=1)
- ###########################################################################################
- # Create ARIMA order=(p,d,q) or SARIMA order=(p,d,q)x(P,D,Q)S:
- p = 1 # AR(p)
- d = 1 # differentiate order
- q = 1 # MA(q)
- # Seasonal
- S = 12# Seasonality
- P = 1 # AR(P) Seasonal
- D = 1 # differentiate order Seasonal
- Q = 1 # MA(Q) Seasonal if there is a spike in 12 then 1, if in 24 then 2...
- ###########################################################################################
- # train2 %>%
- # Arima(order=c(p,d,q), seasonal=c(P,D,Q)) %>%
- # residuals() %>% ggtsdisplay()
- #
- # Arima(train2, order=c(p,d,q), seasonal=c(P,D,Q))
- TS_Model <- arima(log(train),order = c(p,d,q), seasonal = list(order = c(P,D,Q), period = S))
- # Check residuals of the model:
- hist(TS_Model$residuals,prob = T,breaks = 20,col="deepskyblue1",main="Histogram residuals")
- curve(dnorm(x, mean(TS_Model$residuals), sd(TS_Model$residuals)), add=TRUE, col="red", lwd=2)
- qqnorm(TS_Model$residuals,main="Q-Q plot residuals")
- qqline(TS_Model$residuals)
- plot(c(fitted(TS_Model)),c(TS_Model$residuals),pch=20,col="red",xlab = "Fitted Values",ylab="Residuals",main="Residual vs Fitted residuals")
- abline(h=0)
- acf(TS_Model$residuals,length(train)/2,main="ACF for residuals")
- ##############################################
- ### LETS MAKE PREDICTIONS AND PLOT THEM ######
- ##############################################
- # Predict n.ahead steps:
- Predictions <- predict(TS_Model, n.ahead = length(test)+12)
- # Plot predictions:
- plot(train,type = 'l',col='blue',lwd=2,main = 'Fill Data',xlab = 'Time',ylab='Data',xlim=c(1,length(DataSet)),ylim = c(0,max(DataSet)))
- lines(c((length(train)+1):(length(DataSet)+12)),test,lwd=2,col='gray60')
- lines(c((length(train)+1):(length(DataSet)+12)),exp(Predictions$pred),lwd=1,col='red')
- lines(c((length(train)+1):(length(DataSet)+12)),exp(Predictions$pred+Predictions$se),lwd=1,lty=2,col='red')
- lines(c((length(train)+1):(length(DataSet)+12)),exp(Predictions$pred-Predictions$se),lwd=1,lty=2,col='red')
- abline(v=length(train),col='grey')
- legend('topleft',legend=c('Past observations','Real Observations','Mean Forecast','95% Conf. Intervals Forecast'),col=c('blue','gray60','red','red'),lwd=c(2,2,1,1),lty=c(1,1,1,2))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement