Advertisement
brianhaas19

DataViz Battle for the month of September 2019

Oct 9th, 2019
102
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 9.74 KB | None | 0 0
  1. # Libraries:
  2. library(tidyverse)
  3. library(grid)
  4.  
  5. # Read data:
  6. data <- read.csv("https://raw.githubusercontent.com/RedditFormula1/Data/master/commentdata.dat")
  7. names(data) <- c("timestamp", "minutes_hidden", "score_after_15", "score_after_30", "score_after_45", "score_after_1.00", "score_after_1.15", "score_after_1.30", "score_after_1.45",
  8.                  "score_after_2.00")
  9.  
  10. # Functions:
  11. # `get_alpha`: The `alpha` values correspond to the absolute value of the comment scores, and are scaled form 0 to 1. Comments with larger absolute scores have larger `alpha` values, and vice versa. These are used to change the size and transparency of the timelines for each comment score. This emphasizes the comments with larger absolute scores. Since there are fewer negative comments, this function places additional emphasis on them to give them prominence on the plot:
  12. get_alpha <- function(x) {
  13.   score <- x[1]
  14.   divider <- x[2] # largest score in the group
  15.  
  16.   alpha <- score / divider
  17.  
  18.   # Emphasise negative scores:
  19.   if(score < 0) {
  20.     if(alpha > 0.5) {
  21.       alpha <- 1
  22.     } else if(alpha > 0.3) {
  23.       alpha <- 0.7
  24.     }
  25.   }
  26.  
  27.   alpha
  28. }
  29.  
  30. # Data processing:
  31.  
  32. ## Prepare names in advance:
  33. times <- c(15, 30, 45, 60, 75, 90, 105, 120)
  34. names <- c(names(data)[1:2], times, "divider", "alpha_", "ID")
  35.  
  36. ## Transform to long form:
  37. data_long  <- data %>%
  38.   group_by(minutes_hidden) %>%
  39.   mutate(divider = ifelse(score_after_2.00 > 0,
  40.                           max(score_after_2.00[which(score_after_2.00 > 0)]),
  41.                           min(score_after_2.00[score_after_2.00 < 0]))) %>%
  42.   ungroup() %>%
  43.   mutate(alpha_ = select(., score_after_2.00:divider) %>%
  44.                     pmap_dbl(~get_alpha(c(...)))) %>%
  45.   mutate(ID = row_number()) %>%
  46.   setNames(names) %>%
  47.   gather("measuring_time", "score", 3:10) %>%
  48.   mutate(measuring_time = as.numeric(measuring_time),
  49.          minutes_hidden = factor(minutes_hidden, ordered = TRUE, levels = c('0', '30', '60'))) %>%
  50.   select(ID, minutes_hidden, measuring_time, divider, score, alpha_) %>%
  51.   arrange(ID)
  52.  
  53. ## Prepare a summary of the data:
  54. data_sum <- data_long %>%
  55.   mutate(positive = score > 0) %>%
  56.   group_by(measuring_time, minutes_hidden, positive) %>%
  57.   summarise(sd =  sd(score),
  58.             var = var(score),
  59.             avg = mean(score)) %>%
  60.   select(minutes_hidden, measuring_time, positive, sd, var, avg) %>%
  61.   arrange(minutes_hidden, measuring_time, positive)
  62.  
  63. # Plot:
  64.  
  65. ## Colors and labels for facets:
  66. facet_cols <- c('0' = "#FF9400", '30' = "#FE6A00", '60' = "#FF4A00")
  67. facet_axis_labels <- c('', '30', '', '1:00', '', '1:30', '', '2:00')
  68.  
  69. facet_names <- list('0' = "Comment Score Not Hidden",
  70.                     '30' = "Comment Score Hidden For 30 Minutes",
  71.                     '60' = "Comment Score Hidden For 60 Minutes")
  72. facet_labeller <- function(variable, value) {
  73.   return(facet_names[value])
  74. }
  75.  
  76. ## Assemble the plot:
  77. G <- ggplot(data_long) +
  78.  
  79.   # Group the data by `minutes_hidden`:
  80.   facet_wrap(~minutes_hidden, nrow = 1, labeller = facet_labeller) +
  81.  
  82.   # Add v-lines at the `minutes_hidden` marks (add these first so they sit under the other plot elements):
  83.   geom_vline(data = subset(data_long, minutes_hidden == 30), aes(xintercept = 30), color = "grey70") +
  84.   geom_vline(data = subset(data_long, minutes_hidden == 60), aes(xintercept = 60), color = "grey70") +
  85.  
  86.   # Add comment score timelines:
  87.   geom_line(data = data_long, aes(measuring_time, score, group = ID, color = minutes_hidden, alpha = alpha_, size = alpha_), show.legend = FALSE) +
  88.  
  89.   # Add variance curves (group the positive and negative comments and plot variance of each group):
  90.   ## Positive comment scores:
  91.   geom_line(data = subset(data_sum, minutes_hidden == 0 & positive == 1), aes(measuring_time, var), size = 0.5, color = "#45041A") +
  92.   geom_line(data = subset(data_sum, minutes_hidden == 30 & positive == 1), aes(measuring_time, var), size = 0.5, color = "#45041A") +
  93.   geom_line(data = subset(data_sum, minutes_hidden == 60 & positive == 1), aes(measuring_time, var), size = 0.5, color = "#45041A") +
  94.  
  95.   ## Negative comment scores (multiply the variance by -1 for displaying on plot):
  96.   geom_line(data = subset(data_sum, minutes_hidden == 0 & positive == 0), aes(measuring_time, (-1)*var), size = 0.5, color = "#45041A") +
  97.   geom_line(data = subset(data_sum, minutes_hidden == 30 & positive == 0), aes(measuring_time, (-1)*var), size = 0.5, color = "#45041A") +
  98.   geom_line(data = subset(data_sum, minutes_hidden == 60 & positive == 0), aes(measuring_time, (-1)*var), size = 0.5, color = "#45041A") +
  99.  
  100.   ## Lightly shade in the area under the variance curves:
  101.   geom_ribbon(data = subset(data_sum, minutes_hidden == 0 & positive == 1), aes(measuring_time, ymax = var, ymin = 0), fill = "#FFCDB2", alpha = 0.3) +
  102.   geom_ribbon(data = subset(data_sum, minutes_hidden == 0 & positive == 0), aes(measuring_time, ymax = 0, ymin = (-1)*var), fill = "#FFCDB2", alpha = 0.3) +
  103.   geom_ribbon(data = subset(data_sum, minutes_hidden == 30 & positive == 1), aes(measuring_time, ymax = var, ymin = 0), fill = "#FFCDB2", alpha = 0.3) +
  104.   geom_ribbon(data = subset(data_sum, minutes_hidden == 30 & positive == 0), aes(measuring_time, ymax = 0, ymin = (-1)*var), fill = "#FFCDB2", alpha = 0.3) +
  105.   geom_ribbon(data = subset(data_sum, minutes_hidden == 60 & positive == 1), aes(measuring_time, ymax = var, ymin = 0), fill = "#FFCDB2", alpha = 0.3) +
  106.   geom_ribbon(data = subset(data_sum, minutes_hidden == 60 & positive == 0), aes(measuring_time, ymax = 0, ymin = (-1)*var), fill = "#FFCDB2", alpha = 0.3) +
  107.  
  108.   # Add a subtle x-axis:
  109.   geom_hline(aes(yintercept = 0), alpha = 0.5, size = 0.5, color = "#FFCDB2") +
  110.  
  111.   # Add horizontal dashed lines to each plot to indicate the max variances:
  112.   geom_hline(data = subset(data_sum), # positive
  113.              aes(yintercept = unlist(data_sum[data_sum$minutes_hidden == 0 & data_sum$measuring_time == 120 & data_sum$positive == 1, "var"])),
  114.              size = 0.6, color = "#45041A", linetype = "dotted") +
  115.   geom_hline(data = subset(data_sum), # negative
  116.              aes(yintercept = (-1)*unlist(data_sum[data_sum$minutes_hidden == 0 & data_sum$measuring_time == 120 & data_sum$positive == 0, "var"])),
  117.              size = 0.6, color = "#45041A", linetype = "dotted") +
  118.  
  119.   # Add 'Total Variance' label and arrows:
  120.   geom_text(data = subset(data_sum, minutes_hidden == 0)[1, ], aes(x = 105, y = 20,  label = "Total Variance"), hjust = 0.5, color = "#45041A") +
  121.   geom_segment(data = subset(data_sum, minutes_hidden == 0)[1, ], aes(x = 105, xend = 105, y = 30, yend = 72), color = "#45041A",
  122.                arrow = arrow(length = unit(0.1, "inches"), type = "closed")) +
  123.   geom_segment(data = subset(data_sum, minutes_hidden == 0)[1, ], aes(x = 105, xend = 105, y = 10, yend = -36), color = "#45041A",
  124.                arrow = arrow(length = unit(0.1, "inches"), type = "closed")) +
  125.  
  126.   # Add purple shaded area to indicate the `minutes_hidden` part of plot:
  127.   geom_rect(data = subset(data_long, minutes_hidden == 30)[1, ], aes(xmin = 15, xmax = 30, ymin = -Inf, ymax = Inf), fill = "#45041A", alpha = 0.3) +
  128.   geom_rect(data = subset(data_long, minutes_hidden == 60)[1, ], aes(xmin = 15, xmax = 60, ymin = -Inf, ymax = Inf), fill = "#45041A", alpha = 0.3) +
  129.  
  130.   # Scales:
  131.   scale_size("line", range = c(0.05, 1)) +
  132.   scale_x_continuous(expand = c(0, 0), limits = c(15, 120), breaks = times, labels = facet_axis_labels) +
  133.   scale_y_continuous(limits = c(-180, 450), expand = c(0, 0), breaks = c(-100, 0, 100, 200, 300, 400)) +
  134.   scale_color_manual(values = facet_cols) +
  135.  
  136.   # Themes and labels:
  137.   theme_bw() +
  138.   labs(title = "r/dataisbeautiful DataViz Battle for the month of September 2019",
  139.        subtitle = "Visualize the effect of hiding comment scores in /r/formula1",
  140.        x = "Measurement Time", y = "Comment Score",
  141.        caption = "u/brianhaas19") +
  142.   theme(plot.margin = unit(c(5.5, 9, 5.5, 5.5), "points"),
  143.         panel.background = element_rect(fill = "white"),
  144.         plot.title = element_text(color = "#45041A"),
  145.         plot.subtitle = element_text(color = "#45041A"),
  146.         plot.caption = element_text(color = "#45041A", size = 10),
  147.         axis.text = element_text(color = "#45041A"),
  148.         axis.title = element_text(size = 14, color = "#45041A"),
  149.         panel.spacing.x = unit(3, "mm"),
  150.         strip.text = element_text(face = "bold", size = 14, colour = "white"), # alt: #FFCDB2
  151.         strip.text.x = element_text(margin = unit(c(10, 0, 10, 0), "points")))
  152.  
  153. # Make final alterations to the plot using `grid::grid.force`:
  154.  
  155. ## Generate the ggplot2 plot grob (ref: https://stackoverflow.com/a/53457599 )
  156. g <- grid.force(ggplotGrob(G))
  157.  
  158. ## Get the names of grobs and their gPaths into a data.frame structure:
  159. grobs_df <- do.call(cbind.data.frame, grid.ls(g, print = FALSE))
  160.  
  161. ## Build optimal gPaths that will be later used to identify grobs and edit them:
  162. grobs_df$gPath_full <- paste(grobs_df$gPath, grobs_df$name, sep = "::")
  163. grobs_df$gPath_full <- gsub(pattern = "layout::",
  164.                             replacement = "",
  165.                             x = grobs_df$gPath_full,
  166.                             fixed = TRUE)
  167.  
  168. ## Get the gPaths of the strip background grobs:
  169. strip_bg_gpath <- grobs_df$gPath_full[grepl(pattern = ".*strip\\.background.*",
  170.                                             x = grobs_df$gPath_full)]
  171. g <- editGrob(grob = g, gPath = strip_bg_gpath[1], gp = gpar(fill = facet_cols[1]))
  172. g <- editGrob(grob = g, gPath = strip_bg_gpath[2], gp = gpar(fill = facet_cols[2]))
  173. g <- editGrob(grob = g, gPath = strip_bg_gpath[3], gp = gpar(fill = facet_cols[3]))
  174.  
  175. ## Generate the plot:
  176. grid.newpage(); grid.draw(g)
  177. #ggsave(file = str_c(getwd(), "/sept.png"), plot = g, width = 15, height = 7.5) # uncomment to save to disk in current working directory
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement