DataViz Battle for the month of September 2019

# Libraries:
library(tidyverse)
library(grid)

# Read data:
data <- read.csv("https://raw.githubusercontent.com/RedditFormula1/Data/master/commentdata.dat")
names(data) <- c("timestamp", "minutes_hidden", "score_after_15", "score_after_30", "score_after_45", "score_after_1.00", "score_after_1.15", "score_after_1.30", "score_after_1.45",
                 "score_after_2.00")

# Functions:
# `get_alpha`: The `alpha` values correspond to the absolute value of the comment scores, and are scaled form 0 to 1. Comments with larger absolute scores have larger `alpha` values, and vice versa. These are used to change the size and transparency of the timelines for each comment score. This emphasizes the comments with larger absolute scores. Since there are fewer negative comments, this function places additional emphasis on them to give them prominence on the plot:
get_alpha <- function(x) {
  score <- x[1]
  divider <- x[2] # largest score in the group

  alpha <- score / divider

  # Emphasise negative scores:
  if(score < 0) {
    if(alpha > 0.5) {
      alpha <- 1
    } else if(alpha > 0.3) {
      alpha <- 0.7
    }
  }

  alpha
}

# Data processing:

## Prepare names in advance:
times <- c(15, 30, 45, 60, 75, 90, 105, 120)
names <- c(names(data)[1:2], times, "divider", "alpha_", "ID")

## Transform to long form:
data_long  <- data %>%
  group_by(minutes_hidden) %>%
  mutate(divider = ifelse(score_after_2.00 > 0,
                          max(score_after_2.00[which(score_after_2.00 > 0)]),
                          min(score_after_2.00[score_after_2.00 < 0]))) %>%
  ungroup() %>%
  mutate(alpha_ = select(., score_after_2.00:divider) %>%
                    pmap_dbl(~get_alpha(c(...)))) %>%
  mutate(ID = row_number()) %>%
  setNames(names) %>%
  gather("measuring_time", "score", 3:10) %>%
  mutate(measuring_time = as.numeric(measuring_time),
         minutes_hidden = factor(minutes_hidden, ordered = TRUE, levels = c('0', '30', '60'))) %>%
  select(ID, minutes_hidden, measuring_time, divider, score, alpha_) %>%
  arrange(ID)

## Prepare a summary of the data:
data_sum <- data_long %>%
  mutate(positive = score > 0) %>%
  group_by(measuring_time, minutes_hidden, positive) %>%
  summarise(sd =  sd(score),
            var = var(score),
            avg = mean(score)) %>%
  select(minutes_hidden, measuring_time, positive, sd, var, avg) %>%
  arrange(minutes_hidden, measuring_time, positive)

# Plot:

## Colors and labels for facets:
facet_cols <- c('0' = "#FF9400", '30' = "#FE6A00", '60' = "#FF4A00")
facet_axis_labels <- c('', '30', '', '1:00', '', '1:30', '', '2:00')

facet_names <- list('0' = "Comment Score Not Hidden",
                    '30' = "Comment Score Hidden For 30 Minutes",
                    '60' = "Comment Score Hidden For 60 Minutes")
facet_labeller <- function(variable, value) {
  return(facet_names[value])
}

## Assemble the plot:
G <- ggplot(data_long) +

  # Group the data by `minutes_hidden`:
  facet_wrap(~minutes_hidden, nrow = 1, labeller = facet_labeller) +

  # Add v-lines at the `minutes_hidden` marks (add these first so they sit under the other plot elements):
  geom_vline(data = subset(data_long, minutes_hidden == 30), aes(xintercept = 30), color = "grey70") +
  geom_vline(data = subset(data_long, minutes_hidden == 60), aes(xintercept = 60), color = "grey70") +

  # Add comment score timelines:
  geom_line(data = data_long, aes(measuring_time, score, group = ID, color = minutes_hidden, alpha = alpha_, size = alpha_), show.legend = FALSE) +

  # Add variance curves (group the positive and negative comments and plot variance of each group):
  ## Positive comment scores:
  geom_line(data = subset(data_sum, minutes_hidden == 0 & positive == 1), aes(measuring_time, var), size = 0.5, color = "#45041A") +
  geom_line(data = subset(data_sum, minutes_hidden == 30 & positive == 1), aes(measuring_time, var), size = 0.5, color = "#45041A") +
  geom_line(data = subset(data_sum, minutes_hidden == 60 & positive == 1), aes(measuring_time, var), size = 0.5, color = "#45041A") +

  ## Negative comment scores (multiply the variance by -1 for displaying on plot):
  geom_line(data = subset(data_sum, minutes_hidden == 0 & positive == 0), aes(measuring_time, (-1)*var), size = 0.5, color = "#45041A") +
  geom_line(data = subset(data_sum, minutes_hidden == 30 & positive == 0), aes(measuring_time, (-1)*var), size = 0.5, color = "#45041A") +
  geom_line(data = subset(data_sum, minutes_hidden == 60 & positive == 0), aes(measuring_time, (-1)*var), size = 0.5, color = "#45041A") +

  ## Lightly shade in the area under the variance curves:
  geom_ribbon(data = subset(data_sum, minutes_hidden == 0 & positive == 1), aes(measuring_time, ymax = var, ymin = 0), fill = "#FFCDB2", alpha = 0.3) +
  geom_ribbon(data = subset(data_sum, minutes_hidden == 0 & positive == 0), aes(measuring_time, ymax = 0, ymin = (-1)*var), fill = "#FFCDB2", alpha = 0.3) +
  geom_ribbon(data = subset(data_sum, minutes_hidden == 30 & positive == 1), aes(measuring_time, ymax = var, ymin = 0), fill = "#FFCDB2", alpha = 0.3) +
  geom_ribbon(data = subset(data_sum, minutes_hidden == 30 & positive == 0), aes(measuring_time, ymax = 0, ymin = (-1)*var), fill = "#FFCDB2", alpha = 0.3) +
  geom_ribbon(data = subset(data_sum, minutes_hidden == 60 & positive == 1), aes(measuring_time, ymax = var, ymin = 0), fill = "#FFCDB2", alpha = 0.3) +
  geom_ribbon(data = subset(data_sum, minutes_hidden == 60 & positive == 0), aes(measuring_time, ymax = 0, ymin = (-1)*var), fill = "#FFCDB2", alpha = 0.3) +

  # Add a subtle x-axis:
  geom_hline(aes(yintercept = 0), alpha = 0.5, size = 0.5, color = "#FFCDB2") +

  # Add horizontal dashed lines to each plot to indicate the max variances:
  geom_hline(data = subset(data_sum), # positive
             aes(yintercept = unlist(data_sum[data_sum$minutes_hidden == 0 & data_sum$measuring_time == 120 & data_sum$positive == 1, "var"])),
             size = 0.6, color = "#45041A", linetype = "dotted") +
  geom_hline(data = subset(data_sum), # negative
             aes(yintercept = (-1)*unlist(data_sum[data_sum$minutes_hidden == 0 & data_sum$measuring_time == 120 & data_sum$positive == 0, "var"])),
             size = 0.6, color = "#45041A", linetype = "dotted") +

  # Add 'Total Variance' label and arrows:
  geom_text(data = subset(data_sum, minutes_hidden == 0)[1, ], aes(x = 105, y = 20,  label = "Total Variance"), hjust = 0.5, color = "#45041A") +
  geom_segment(data = subset(data_sum, minutes_hidden == 0)[1, ], aes(x = 105, xend = 105, y = 30, yend = 72), color = "#45041A",
               arrow = arrow(length = unit(0.1, "inches"), type = "closed")) +
  geom_segment(data = subset(data_sum, minutes_hidden == 0)[1, ], aes(x = 105, xend = 105, y = 10, yend = -36), color = "#45041A",
               arrow = arrow(length = unit(0.1, "inches"), type = "closed")) +

  # Add purple shaded area to indicate the `minutes_hidden` part of plot:
  geom_rect(data = subset(data_long, minutes_hidden == 30)[1, ], aes(xmin = 15, xmax = 30, ymin = -Inf, ymax = Inf), fill = "#45041A", alpha = 0.3) +
  geom_rect(data = subset(data_long, minutes_hidden == 60)[1, ], aes(xmin = 15, xmax = 60, ymin = -Inf, ymax = Inf), fill = "#45041A", alpha = 0.3) +

  # Scales:
  scale_size("line", range = c(0.05, 1)) +
  scale_x_continuous(expand = c(0, 0), limits = c(15, 120), breaks = times, labels = facet_axis_labels) +
  scale_y_continuous(limits = c(-180, 450), expand = c(0, 0), breaks = c(-100, 0, 100, 200, 300, 400)) +
  scale_color_manual(values = facet_cols) +

  # Themes and labels:
  theme_bw() +
  labs(title = "r/dataisbeautiful DataViz Battle for the month of September 2019",
       subtitle = "Visualize the effect of hiding comment scores in /r/formula1",
       x = "Measurement Time", y = "Comment Score",
       caption = "u/brianhaas19") +
  theme(plot.margin = unit(c(5.5, 9, 5.5, 5.5), "points"),
        panel.background = element_rect(fill = "white"),
        plot.title = element_text(color = "#45041A"),
        plot.subtitle = element_text(color = "#45041A"),
        plot.caption = element_text(color = "#45041A", size = 10),
        axis.text = element_text(color = "#45041A"),
        axis.title = element_text(size = 14, color = "#45041A"),
        panel.spacing.x = unit(3, "mm"),
        strip.text = element_text(face = "bold", size = 14, colour = "white"), # alt: #FFCDB2
        strip.text.x = element_text(margin = unit(c(10, 0, 10, 0), "points")))

# Make final alterations to the plot using `grid::grid.force`:

## Generate the ggplot2 plot grob (ref: https://stackoverflow.com/a/53457599 )
g <- grid.force(ggplotGrob(G))

## Get the names of grobs and their gPaths into a data.frame structure:
grobs_df <- do.call(cbind.data.frame, grid.ls(g, print = FALSE))

## Build optimal gPaths that will be later used to identify grobs and edit them:
grobs_df$gPath_full <- paste(grobs_df$gPath, grobs_df$name, sep = "::")
grobs_df$gPath_full <- gsub(pattern = "layout::",
                            replacement = "",
                            x = grobs_df$gPath_full,
                            fixed = TRUE)

## Get the gPaths of the strip background grobs:
strip_bg_gpath <- grobs_df$gPath_full[grepl(pattern = ".*strip\\.background.*",
                                            x = grobs_df$gPath_full)]
g <- editGrob(grob = g, gPath = strip_bg_gpath[1], gp = gpar(fill = facet_cols[1]))
g <- editGrob(grob = g, gPath = strip_bg_gpath[2], gp = gpar(fill = facet_cols[2]))
g <- editGrob(grob = g, gPath = strip_bg_gpath[3], gp = gpar(fill = facet_cols[3]))

## Generate the plot:
grid.newpage(); grid.draw(g)
#ggsave(file = str_c(getwd(), "/sept.png"), plot = g, width = 15, height = 7.5) # uncomment to save to disk in current working directory