Correlation of Tesla share quote and Robintrack popularity

# robintrack_correlation.R
#
# FC, 18.07.2019
#
# env.: R version 3.6.0 (2019-04-26) -- "Planting of a Tree"
#
# Platform: x86_64-w64-mingw32/x64 (64-bit)
#
#
# test: OK
#

library(chron) # loads as.POSIXlt, as.POSIXct
library(tidyquant) # loads tidyquant, tidyverse, lubridate, xts, quantmod, TTRlibrary(xts)
library(cowplot)
library(scales)  # for scale_y_continuous()


###############################################
## user input:
# how many years of daily quotes:
curr1 <- c('USD')
tckr1 <- c('TSLA')
security_name1 <- paste("Tesla (", tckr1, ")", sep ="")
###############################################


# class(popularity): data.frame
popularity <- read.csv("Robintrack popularity, Tesla, 2018-2019.csv", header = T, dec = ".", sep = ";")
# input:
# timestamp;popularity
# 2018-05-02 04:53:19 UTC;81038
# 2018-05-02 06:38:31 UTC;81038
# 2018-05-03 00:34:51 UTC;80137
# 2018-05-03 06:33:24 UTC;80137
# 2018-05-03 06:48:25 UTC;80137
# 2018-05-03 07:06:33 UTC;80137
# 2018-05-03 07:33:50 UTC;80137
# 2018-05-03 07:43:02 UTC;80137
# 2018-05-03 11:19:16 UTC;80138
# 2018-05-03 12:42:40 UTC;80138
# 2018-05-03 13:42:40 UTC;80149
# 2018-05-03 14:42:41 UTC;81223

# > head(popularity)
#                 timestamp popularity
# 1 2018-05-02 04:53:19 UTC      81038
# 2 2018-05-02 06:38:31 UTC      81038
# 3 2018-05-03 00:34:51 UTC      80137
# 4 2018-05-03 06:33:24 UTC      80137
# 5 2018-05-03 06:48:25 UTC      80137
# 6 2018-05-03 07:06:33 UTC      80137
# >


times1 <- as.POSIXlt(popularity$timestamp, tz = "UTC")

df1 <- data.frame(time = times1, popularity = popularity$popularity)
# > head(df1)
#                  time popularity
# 1 2018-05-02 04:53:19      81038
# 2 2018-05-02 06:38:31      81038
# 3 2018-05-03 00:34:51      80137
# 4 2018-05-03 06:33:24      80137
# 5 2018-05-03 06:48:25      80137
# 6 2018-05-03 07:06:33      80137
#


# keep only the last popularity data for a day:
df1_xts <- as.xts(df1, order.by = df1$time)
# > head(df1_xts)
#                     time                  popularity
# 2018-05-02 04:53:19 "2018-05-02 04:53:19" " 81038"
# 2018-05-02 06:38:31 "2018-05-02 06:38:31" " 81038"
# 2018-05-03 00:34:51 "2018-05-03 00:34:51" " 80137"
# 2018-05-03 06:33:24 "2018-05-03 06:33:24" " 80137"
# 2018-05-03 06:48:25 "2018-05-03 06:48:25" " 80137"
# 2018-05-03 07:06:33 "2018-05-03 07:06:33" " 80137"
# Warnmeldung:
# timezone of object (UTC) is different than current timezone ().


ep       <- endpoints(df1_xts, on = "days")
df1_xts1 <- df1_xts[ep]
# > head(df1_xts1)
#                     time                  popularity
# 2018-05-02 06:38:31 "2018-05-02 06:38:31" " 81038"
# 2018-05-03 23:42:40 "2018-05-03 23:42:40" " 83646"
# 2018-05-04 23:42:42 "2018-05-04 23:42:42" " 83240"
# 2018-05-05 23:42:41 "2018-05-05 23:42:41" " 83237"
# 2018-05-06 23:42:42 "2018-05-06 23:42:42" " 83237"
# 2018-05-07 23:42:41 "2018-05-07 23:42:41" " 81129"
# Warnmeldung:
# timezone of object (UTC) is different than current timezone ().


df1a <- data.frame(date       = as.Date(index(df1_xts1)),
                   popularity = as.numeric(df1_xts1$popularity))
# > head(df1a)
#         date popularity
# 1 2018-05-02      81038
# 2 2018-05-03      83646
# 3 2018-05-04      83240
# 4 2018-05-05      83237
# 5 2018-05-06      83237
# 6 2018-05-07      81129
#
# > str(df1a)
# 'data.frame':   435 obs. of  2 variables:
#  $ date      : Date, format: "2018-05-02" "2018-05-03" "2018-05-04" ...
#  $ popularity: num  81038 83646 83240 83237 83237 ...
# >

first_date <- df1b[1, ]$date
last_date  <- last(df1b$date)


dat1 <- tq_get(tckr1[1], get = "stock.prices", from = first_date, to = last_date)
# > dat1
# # A tibble: 304 x 7
#    date        open  high   low close   volume adjusted
#    <date>     <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#  1 2018-05-02  299.  307.  298.  301.  8970400     301.
#  2 2018-05-03  279.  288.  275.  284. 17352100     284.
#  3 2018-05-04  283   297.  280.  294.  8569400     294.
#  4 2018-05-07  298.  306.  295.  303.  8678200     303.
#  5 2018-05-08  301.  308.  299   302.  5930000     302.
#  6 2018-05-09  300.  307.  300.  307.  5727400     307.
#  7 2018-05-10  308.  313.  304.  305.  5651600     305.
#  8 2018-05-11  308.  309.  299.  301.  4679600     301.
#  9 2018-05-14  303.  305.  292.  292.  7286800     292.
# 10 2018-05-15  285.  287.  280.  284.  9519200     284.
# # ... with 294 more rows


# merge df1b (popularity) with daily close quotes:
df1b <- as_tibble(df1a)

# df1c is a data.frame:
df1c <- merge(x = df1b, y = dat1, by = "date")
# > head(df1c)
#         date popularity   open   high    low  close   volume adjusted
# 1 2018-05-02      81038 298.57 306.85 297.78 301.15  8970400   301.15
# 2 2018-05-03      83646 278.79 288.04 275.23 284.45 17352100   284.45
# 3 2018-05-04      83240 283.00 296.86 279.52 294.09  8569400   294.09
# 4 2018-05-07      81129 297.50 305.96 295.17 302.77  8678200   302.77
# 5 2018-05-08      80656 300.80 307.75 299.00 301.97  5930000   301.97
# 6 2018-05-09      79994 300.41 307.01 300.05 306.85  5727400   306.85
# >
#


df1d <- as_tibble(df1c)  # have a tibble here for plotting


######################
######################
# correlation:
x1 <- df1c$adjusted
y1 <- df1c$popularity
cor.test(x1, y1)
# > cor.test(x1, y1)
#
#         Pearson's product-moment correlation
#
# data:  x1 and y1
# t = -34.092, df = 297, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
#  -0.9134175 -0.8667673
# sample estimates:
#        cor
# -0.8924537
#
######################
######################


# plot the multi chart:
plot_title1 <- paste(security_name1, ": ", first_date, " - ", last_date, sep = "")

ylabel  <- paste("[", curr1, "]")
ylabel2 <- paste("Volume")
ylabel3 <- paste("Popularity [accounts] (red)")


# plot price:
p1 <- df1d %>%
      ggplot(aes(x = date)) +

      # plot daily prices as a line:
      geom_line(aes(y = adjusted), colour = "grey") +

      # plot daily popularity prices as a line:
      geom_line(aes(y = popularity / 400), colour = "red") +
      scale_y_continuous(sec.axis = sec_axis(~ (. * 400),
                         name = ylabel3,
                         label = comma)) +

      labs(title = plot_title1, y = ylabel) +

      theme_tq() +

      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_blank()
      )


# plot volume below:
p2 <- df1d %>%
      ggplot(aes(x = date, y = volume)) +

      # plot daily volume data as bars:
      geom_bar(stat = "identity") +

      labs(y = ylabel2) +
      scale_y_continuous(label = comma) +

      theme_tq() +

      theme(
        axis.title.x = element_blank()
      )


plot_grid(p1, p2, ncol = 1, align = 'v')


# end of robintrack_correlation.R