Untitled

library(tseries)
library(forecast)
library(cluster)
### In case it is your first time in R, install them as: ###
#install.packages('tseries')
#install.packages('forecast')
#install.packages('cluster')

path <- '/Users/heidaoskpetursdottir/Documents/DecisionMaking/'

PassengersData <- na.omit(read.csv(paste0(path,'passengers.csv'),sep=','))$Passengers.thousands.

DataSet <- PassengersData

# Use the last year as test set (validation set) but remember that you need to redict for the following year, so you wont have the real data to compare.

smp_size <- length(DataSet)-12
train_ind <- c(1:smp_size)
train <- DataSet[train_ind]
test <- DataSet[-train_ind]

plot(train,col='blue',pch=20,main = '',xlab = 'Time [months]',ylab='# thousands of passengers',xlim=c(1,length(train)),ylim = c(min(DataSet),max(DataSet)))
lines(train,col='blue',lwd=1)

#Data for number of passengers during the past 11 years

plot(train,col='blue',pch=20,main = '',xlab = 'Time [months]',ylab='# thousands of passengers',xlim=c(1,length(DataSet)),ylim = c(min(DataSet),max(DataSet)))
lines(train,col='blue',lwd=1)
points(c((length(train)+1):(length(DataSet))),test,pch=20,col='black')
lines(c((length(train)+1):(length(DataSet))),test,lwd=1,col='black')
abline(v=length(train),col='grey')
legend('topleft',legend=c('Training Set','Test Set'),col=c('blue','black'),lwd=c(2,2))

##############################
### TIME SERIES ANLYSIS ######
##############################

# Plot ACF and PACF to check stationarity and correlations:
acf(train,main='ACF for Data',length(train)/2)
pacf(train,main='PACF for Data',length(train)/2)

# Apply transformations if required:
#log(train)
#diff(train,1)
#diff(log(train),1)
#diff(log(train),S)

###########################################################################################
# choose model Parameters
# Redifine train set
train2 <- diff(diff(log(train),12),1) # differentiate once for the mean and once for the seasonality.
# log to remove the variance. We want all our data to be random noice. We are removing all structure we have in our data to get the random walk to see the correlation between the lacks.

acf(train2,main='ACF for Data after differentiation',length(train)/2)
# look at the signficant lag
pacf(train2,main='PACF for Data after differentiation',length(train)/2)

plot(train2,col="blue",pch=20,main = "",xlab = "Time [months]",ylab="# thousands of passengers",xlim=c(1,length(DataSet)),ylim = c(min(diff(DataSet)),max(diff(DataSet))))
lines(train2,col="blue",lwd=1)

###########################################################################################
# Create ARIMA order=(p,d,q) or SARIMA order=(p,d,q)x(P,D,Q)S:
  p = 1 # AR(p)
  d = 1 # differentiate order
  q = 1 # MA(q)
  # Seasonal
  S = 12# Seasonality
  P = 1 # AR(P) Seasonal
  D = 1 # differentiate order Seasonal
  Q = 1 # MA(Q) Seasonal if there is a spike in 12 then 1, if in 24 then 2...
###########################################################################################
#  train2 %>%
#    Arima(order=c(p,d,q), seasonal=c(P,D,Q)) %>%
#    residuals() %>% ggtsdisplay()
#
#  Arima(train2, order=c(p,d,q), seasonal=c(P,D,Q))

TS_Model <- arima(log(train),order = c(p,d,q), seasonal = list(order = c(P,D,Q), period = S))
# Check residuals of the model:
hist(TS_Model$residuals,prob = T,breaks = 20,col="deepskyblue1",main="Histogram residuals")
curve(dnorm(x, mean(TS_Model$residuals), sd(TS_Model$residuals)), add=TRUE, col="red", lwd=2)

qqnorm(TS_Model$residuals,main="Q-Q plot residuals")
qqline(TS_Model$residuals)

plot(c(fitted(TS_Model)),c(TS_Model$residuals),pch=20,col="red",xlab = "Fitted Values",ylab="Residuals",main="Residual vs Fitted residuals")
abline(h=0)

acf(TS_Model$residuals,length(train)/2,main="ACF for residuals")

##############################################
### LETS MAKE PREDICTIONS AND PLOT THEM ######
##############################################

# Predict n.ahead steps:
Predictions <- predict(TS_Model, n.ahead = length(test)+12)
# Plot predictions:
plot(train,type = 'l',col='blue',lwd=2,main = 'Fill Data',xlab = 'Time',ylab='Data',xlim=c(1,length(DataSet)),ylim = c(0,max(DataSet)))
lines(c((length(train)+1):(length(DataSet)+12)),test,lwd=2,col='gray60')
lines(c((length(train)+1):(length(DataSet)+12)),exp(Predictions$pred),lwd=1,col='red')
lines(c((length(train)+1):(length(DataSet)+12)),exp(Predictions$pred+Predictions$se),lwd=1,lty=2,col='red')
lines(c((length(train)+1):(length(DataSet)+12)),exp(Predictions$pred-Predictions$se),lwd=1,lty=2,col='red')
abline(v=length(train),col='grey')
legend('topleft',legend=c('Past observations','Real Observations','Mean Forecast','95% Conf. Intervals Forecast'),col=c('blue','gray60','red','red'),lwd=c(2,2,1,1),lty=c(1,1,1,2))