hw1-2

require(data.table)
require(ggplot2)
require(MASS)
require(fBasics)
require(moments)
alpha = 0.05

#Question 1
# 1. Consider the daily stock returns of American Express (AXP), Caterpillar (CAT), and Starbucks
# (SBUX) from January 1999 to December 2008. The data are simple returns given in the file d-3stocks9908.txt (date, axp, cat, sbux).
# (a) Compute the sample mean, standard deviation, skewness, excess kurtosis, minimum, and maximum of each simple return series. (Hint: use the R command basicStats of fBasics)
# (b) Transform the simple returns to log returns. Compute the sample mean, standard deviation, skewness, excess kurtosis, minimum, and maximum of each log return series.
# (c) Test the null hypothesis that the mean of the log returns of AXP is zero. (Hint: use the R command t.test)
# (d) Obtain the histogram (with nclass=40) and sample density plot of the daily log returns of AXP stock.
# (e) Test if log return of AXP follows normal distribution by using at least two methods. (Need to  give explanation)

# Import Data
q1_data = fread("d-3stocks9908.txt",header=T)

# Part a
q1_data_stats = basicStats(q1_data)
sprintf("Getting Simple Returns")

for (stock in colnames(q1_data_stats)[-1]){
  print(sprintf("%s (Simple Returns): Mean = %s , Stdev = %s, Skewness = %s , Kurtosis = %s , Minimum = %s , Maximum = %s",
                stock , q1_data_stats["Mean",stock] , q1_data_stats["Stdev",stock] , q1_data_stats["Skewness",stock] , q1_data_stats["Kurtosis",stock] ,  q1_data_stats["Minimum",stock] ,  q1_data_stats["Maximum",stock] ))
}

# Part B
q1_logR = log(q1_data+1)
q1_logR_stats = basicStats(q1_logR)

for (stock in colnames(q1_logR_stats)[-1]){
  print(sprintf("%s (Log Returns): Mean = %s , Stdev = %s, Skewness = %s , Kurtosis = %s , Minimum = %s , Maximum = %s",
                stock , q1_logR_stats["Mean",stock] , q1_logR_stats["Stdev",stock] , q1_logR_stats["Skewness",stock] , q1_logR_stats["Kurtosis",stock] ,  q1_logR_stats["Minimum",stock] ,  q1_logR_stats["Maximum",stock] ))
}

# Part c
axp_pvalue = t.test(q1_logR$axp)$p.value
sprintf("AXP: The p-value is %s, hence we %s the null hypothesis that the mean is 0", axp_pvalue , if (axp_pvalue <= alpha) "Reject"  else "Do not reject")


# Part D

ggplot(data = q1_logR, aes(q1_logR$axp)) +
  geom_histogram(bins = 40, col="grey",  fill="green",  alpha = .2) +
  labs(title="Histogram of Daily Log Returns") +
  labs(x="Daily Log Returns", y="Count")

plot(density(q1_logR$axp))

# Part E
axp_jb_test = normalTest(q1_logR$axp,method="jb")
axp_jb_pvalue = axp_jb_test@test$p.value
sprintf("Jarque Bera Test returns the values for the 'Chi-squared' statistic with 2 degrees of freedom, and the asymptotic p-value.")
sprintf("P Value is %s, hence %s null hypothesis of normality", axp_jb_pvalue ,if (axp_jb_pvalue <= alpha) "Reject"  else "Do not reject" )

axp_sw_test = normalTest(q1_logR$axp,method="sw")
axp_sw_pvalue = axp_sw_test@test$p.value
sprintf("Shapiro Test returns the values for the 'W' statistic and the p-value")
sprintf("P Value is %s, hence %s null hypothesis of normality", axp_sw_pvalue ,if (axp_sw_pvalue <= alpha) "Reject"  else "Do not reject" )


#Question 2
# 2. Answer the same questions as Problem 1 but using monthly returns for General Motors (GM),
# CRSP value-weighted index (VW), CRSP equal-weighted index (EW) and S&P composite index
# from January 1975 to December 2008. The returns of the indexes include dividend distributions.
# Data file is m-gm3dx7508.txt (date, gm, vw, ew, sp).

# Import Data
q2_data <- fread("m-gm3dx7508.txt",header=T)

# Part a
q2_data_stats = basicStats(q2_data)


for (stock in colnames(q2_data_stats)[-1]){
  print(sprintf("%s (Simple Returns): Mean = %s , Stdev = %s, Skewness = %s , Kurtosis = %s , Minimum = %s , Maximum = %s",
                stock , q2_data_stats["Mean",stock] , q2_data_stats["Stdev",stock] , q2_data_stats["Skewness",stock] , q2_data_stats["Kurtosis",stock] ,  q2_data_stats["Minimum",stock] ,  q2_data_stats["Maximum",stock] ))
}

# Part B
q2_logR = log(q2_data+1)
q2_logR_stats = basicStats(q2_logR)

for (stock in colnames(q2_logR_stats)[-1]){
  print(sprintf("%s (Log Returns): Mean = %s , Stdev = %s, Skewness = %s , Kurtosis = %s , Minimum = %s , Maximum = %s",
                stock , q2_logR_stats["Mean",stock] , q2_logR_stats["Stdev",stock] , q2_logR_stats["Skewness",stock] , q2_logR_stats["Kurtosis",stock] ,  q2_logR_stats["Minimum",stock] ,  q2_logR_stats["Maximum",stock] ))
}


for (stock in colnames(q2_logR)[-1]){
  # Part c
  pvalue = t.test(q2_logR_stats[,stock])$p.value
  print(sprintf("%s: The p-value is %s, hence we %s the null hypothesis that the mean is 0", stock, pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject"))

  # Part D
  print( ggplot(data = q2_logR, aes(q2_logR[[stock]])) +
           geom_histogram(bins = 40, col="grey",  fill="green",  alpha = .2) +
           labs(title= cat("Histogram of Daily Log Returns")) +
           labs(x=paste0("Daily Log Returns for ", stock), y="Count"))

  plot(density(q2_logR[[stock]]))

  # Part E
  jb_test = normalTest(q2_logR[[stock]],method="jb")
  jb_pvalue = jb_test@test$p.value
  print(sprintf("Jarque Bera Test returns the values for the 'Chi-squared' statistic with 2 degrees of freedom, and the asymptotic p-value."))
  print(sprintf("P Value is %s, hence %s null hypothesis of normality", jb_pvalue ,if (jb_pvalue <= alpha) "Reject"  else "Do not reject" ))

  sw_test = normalTest(q2_logR[[stock]],method="sw")
  sw_pvalue = sw_test@test$p.value
  print(sprintf("Shapiro Test returns the values for the 'W' statistic and the p-value"))
  print(sprintf("P Value is %s, hence %s null hypothesis of normality", sw_pvalue ,if (sw_pvalue <= alpha) "Reject"  else "Do not reject" ))

}


# Question 3
# 3. Consider the monthly stock returns of value-weighted index (VW) from January 1975 to December 2008 in Problem 2.
# Perform the tests and draw conclusions using the 5% significance level.
# (a) Test H0: u = 0 versus Ha : μ != 0,where μ denotes the mean return.
# (b) Test H0: m3 = 0 versus Ha : m3 != 0, where m3 denotes the skewness.
# (c) Test H0: K = 3 versus Ha : K != 3, where K denotes the kurtosis.

# Part a
stock = "vw"
pvalue = t.test(q2_logR[[stock]]  ) $p.value
sprintf("%s: The p-value is %s, hence we %s the null hypothesis that the mean is 0", stock,  pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject")

# Part b
pvalue = agostino.test(q2_logR[[stock]] )$p.value
sprintf("%s:: The p-value is %s, hence we %s the null hypothesis that skewness is 0",  stock, pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject")

# Part c
pvalue = anscombe.test(q2_logR[[stock]] )$p.value
sprintf("%s:: The p-value is %s, hence we %s the null hypothesis that the kurtosis is 3", stock,  pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject")


# Question 4
# 4. Consider the daily log returns of AXP stock from January 1999 to December 2008 as in
# Problem 1. Perform the following tests:
# (a) Test the null hypothesis that the skewness measure of the returns is zero;
# (b) Test the null hypothesis that the excess kurtosis of the returns is zero.

stock = "axp"
pvalue = agostino.test(q1_logR[[stock]] )$p.value
sprintf("%s:: The p-value is %s, hence we %s the null hypothesis that skewness is 0",  stock, pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject")

# Part c
pvalue = anscombe.test(q1_logR[[stock]] )$p.value
sprintf("%s:: The p-value is %s, hence we %s the null hypothesis that the kurtosis is 3", stock,  pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject")


#Question 5
# 5. Daily foreign exchange rates (spot rates) can be obtained from the Federal Reserve Bank in St
# Louis (FRED). The data are the noon buying rates in New York City certified by the Federal
# Reserve Bank of New York. Consider the exchange rates between the U.S. dollar and the Euro
# from January 4, 1999 to March 8, 2013. See the file d-exuseu.txt.
#
# (a) Compute the daily log return of the exchange rate.
# (b) Compute the sample mean, standard deviation, skewness, excess kurtosis, minimum, and
# maximum of the log returns of the exchange rate.
# (c) Obtain a density plot of the daily long returns of Dollar-Euro exchange rate.
# (d) Test H0: u = 0 versus Ha : u != 0, where μ denotes the mean of the daily log return of Dollar-Euro exchange rate.

# Par A
q5_data =  fread("d-exuseu.txt",header=T)
q5_logR = diff(log(q5_data$VALUE))

#5b
q5_stats = basicStats(q5_logR)
sprintf("FRED (Log Returns): Mean = %s , Stdev = %s, Skewness = %s , Kurtosis = %s , Minimum = %s , Maximum = %s",
        q5_stats["Mean",] , q5_stats["Stdev",] , q5_stats["Skewness",] , q5_stats["Kurtosis",] ,  q5_stats["Minimum",] ,  q5_stats["Maximum",] )

#5c
plot(density(q5_logR))
#5d

pvalue = t.test(q5_logR)$p.value
sprintf("%s: The p-value is %s, hence we %s the null hypothesis that the mean is 0", "Dollar Euro Exchange Return",  pvalue , if (pvalue <= alpha) "Reject"  else "Do not reject")

#Question 6
# 6. Consider SP500 and IBM data (in file ‘SP.csv’ and ‘IBM.csv’ respectively):
# (a) Find the log return of SP and IBM by using ‘Adjusted Price’
# (b) Merge the above two return series into a new dataset (hint: using ‘data.frame’) and plot them (y is IBM, x is SP)
# (c) Find the best linear model by using AIC as criterion (need to show all your models)

sp_data =  fread("SP.csv",header=T)
ibm_data =  fread("IBM.csv",header=T)

# Part A
sp_logR = diff(log(sp_data[["SP.Adjusted"]]))
ibm_logR = diff(log(ibm_data[["IBM.Adjusted"]]))

# Part B
new_table = data.table(y = ibm_logR , x = sp_logR)
qplot(new_table$y, new_table$x)


# Part C
aic_list = list()

# MODEL 1: Simple linear regression with intercept
m1 <- lm(y~x, new_table)
summary(m1)
aic_list = c(aic_list ,AIC(m1))

# MODEL 2: Simple linear regression with no intercept
m2 <- lm(y ~ -1 + x, new_table)
summary(m2)
aic_list = c(aic_list ,AIC(m2))


# Get index of all elements whose returns are non positive
idx = which(new_table$x <= 0)

# List with non positive returns, otherwise 0
nsp <- rep(0,length(new_table$x))
nsp[idx] = new_table$x[idx]

# List with 1 when non positive returns, otherwise 0
c1 <- rep(0,length(new_table$x))
c1[idx] = 1


new_table2 <- data.frame(y = new_table$y, x = new_table$x, c1, nsp)   # Show the resulting variables

# Model 3: Different intercepts for positive, and non positive returns
m3 <- lm(y ~ x + c1 , new_table2) #with different intercepts (alpha) for positive and negative SP log returns
summary(m3)
aic_list = c(aic_list ,AIC(m3))

# Model 4: Different coefficients for positive and non positive returns
m4 <- lm(y ~ x + nsp, new_table2) #with different coefficients (beta) for positive and negative SP log returns
summary(m4)
aic_list = c(aic_list ,AIC(m4))


# Model 5: Different coefficients and intercept for positive and non positive returns
m5 <- lm(y ~ x + c1 + nsp, new_table2)
summary(m5)
aic_list = c(aic_list ,AIC(m5))
idx = which.min(aic_list)
sprintf("The best model is model %s, with an AIC of %s" , idx , aic_list[idx])