brianhaas19

GABF Gold Medal Cumulative Distribution From CSV

Nov 15th, 2020 (edited)
1,193
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ###################################################################################################
  2. # Great American Beer Festival - Cumulative distribution of gold medal winning breweries from 1987 to 2020
  3. # Created by: reddit.com/user/brianhaas19
  4.  
  5. # Link to data
  6. # https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-10-20/readme.md
  7.  
  8. # Setup
  9. library(tidyverse)
  10. library(tidytuesdayR)
  11. library(scales)
  12. library(gganimate)
  13. library(transformr)
  14. theme_set(theme_bw())
  15.  
  16. # Load data
  17. ### Load clean data set of GABF gold medal winners from CSV:
  18. gold_medal_map <- read_csv("https://pastebin.com/raw/TLpKkMnG")
  19.  
  20. # Load additional data
  21. ### State ID's:
  22. state_ids <- structure(list(state = c("alabama", "alaska", "arizona", "arkansas",
  23. "california", "colorado", "connecticut", "delaware", "district of columbia",
  24. "florida", "georgia", "hawaii", "idaho", "illinois", "indiana",
  25. "iowa", "kansas", "kentucky", "louisiana", "maine", "maryland",
  26. "massachusetts", "michigan", "minnesota", "mississippi", "missouri",
  27. "montana", "nebraska", "nevada", "new hampshire", "new jersey",
  28. "new mexico", "new york", "north carolina", "north dakota", "ohio",
  29. "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina",
  30. "south dakota", "tennessee", "texas", "utah", "vermont", "virginia",
  31. "washington", "west virginia", "wisconsin", "wyoming"), id = c("AL",
  32. "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI",
  33. "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
  34. "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
  35. "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
  36. "VT", "VA", "WA", "WV", "WI", "WY")), class = c("spec_tbl_df",
  37. "tbl_df", "tbl", "data.frame"), row.names = c(NA, -51L), spec = structure(list(
  38.     cols = list(state = structure(list(), class = c("collector_character",
  39.     "collector")), id = structure(list(), class = c("collector_character",
  40.     "collector"))), default = structure(list(), class = c("collector_guess",
  41.     "collector")), skip = 1L), class = "col_spec"))
  42.  
  43. ### Get map data from the `maps` package:
  44. states <- map_data("state")
  45. names(states) <- names(states) %>% str_replace("region", "state")
  46.  
  47. # Colors and Themes
  48. colors <- c("GABF_blue" = "#005F85",
  49.             "GABF_gold" = "#C69F32")
  50.  
  51. map_theme <- theme(
  52.   axis.text = element_blank(),
  53.   axis.line = element_blank(),
  54.   axis.ticks = element_blank(),
  55.   panel.border = element_blank(),
  56.   panel.grid = element_blank(),
  57.   axis.title = element_blank(),
  58.   plot.title = element_text(color = colors["GABF_blue"]),
  59.   plot.subtitle = element_text(color = colors["GABF_blue"], size = 18),
  60.   plot.caption = element_text(color = colors["GABF_blue"], hjust = 0)
  61. )
  62.  
  63. # Wrangle data
  64. ### Create a data frame showing the cumulative number of gold medals won over time:
  65. cum_medals <- gold_medal_map %>%
  66.   group_by(state_id, year) %>%
  67.   mutate(sum_medal = sum(n)) %>%
  68.   distinct(year, state_id, sum_medal) %>%
  69.   ungroup() %>%
  70.   group_by(state_id) %>%
  71.   mutate(cum_medal = cumsum(sum_medal)) %>%
  72.   select(year, state_id, cum_medal)
  73.  
  74. ### Build a separate data frame with one row for each year from 1987 to 2020 and state.
  75. ### Even though not every state won a medal every year they will need to have a row in this table.
  76. ### Then fill in the missing years with the most recent total:
  77.  
  78. ### Build the data frame skeleton:
  79. cum_medals_df <- expand_grid(
  80.   year = 1987:2020,
  81.   state_id = filter(state_ids, !(id %in% c("AK", "HI", "DC")))$id
  82. ) %>%
  83.   arrange(state_id, year) # this step is crucial so that dplyr::fill() works later
  84.  
  85. ### Add the cumulative medal totals:
  86. cum_medals_df <- cum_medals_df %>%
  87.   left_join(cum_medals, by = c("year", "state_id"))
  88.  
  89. ### Enter a zero for any states who did not win a medal in 1987 so that we can use `fill(direction = "down")`:
  90. cum_medals_df[(cum_medals_df$year == 1987) & is.na(cum_medals_df$cum_medal), "cum_medal"] <- 0
  91.  
  92. ### Fill the missing values with the most recent total:
  93. cum_medals_df <- cum_medals_df %>%
  94.   fill(cum_medal)
  95. cum_medals_df
  96.  
  97. ### Now that the table has been filled go back and replace all zeroes with NA so that they appear as gray on the map.
  98. ### This has to be done after using `fill()`, otherwise the filling process doesn't work.
  99. ### For example it starts grabbing values from other states and years to fill the `NA`s for 1987.
  100. cum_medals_df$cum_medal[cum_medals_df$cum_medal == 0] <- NA
  101.  
  102. ### Finally, build a data frame of all years and cumulative medal totals combined with the map data:
  103. ### (large data frame, ~500k rows)
  104. ### Ref: https://stackoverflow.com/a/28506694
  105. repeat_data_frame <- function(d, n) {
  106.   if ("data.table" %in% class(d)) return(d[rep(seq_len(nrow(d)), n)])
  107.   return(d[rep(seq_len(nrow(d)), n), ])
  108. }
  109.  
  110. states_repeat <- repeat_data_frame(states, 34)
  111. years <- rep(1987:2020, each = nrow(states))
  112. states_repeat$year <- years
  113. states_cum_medals <- states_repeat %>%
  114.   left_join(state_ids, by = "state") %>%
  115.   rename(state_id = id) %>%
  116.   left_join(cum_medals_df, by = c("year", "state_id"))
  117. states_cum_medals
  118.  
  119. # Visualize
  120.  
  121. ### Static plot with all medals and states filled to 2020 levels (cumulative totals):
  122. ### Recommended parameters for R notebook: fig.height=6, fig.width=12
  123. g <- ggplot(data = filter(states_cum_medals, year == 2020)) +
  124.   geom_polygon(aes(x = long, y = lat, group = group, fill = cum_medal),
  125.                size = 0.2,
  126.                color = colors["GABF_blue"]) +
  127.   geom_jitter(data = gold_medal_map,
  128.              aes(long, lat, size = n),
  129.              color = colors["GABF_gold"],
  130.              alpha = 0.3,
  131.              show.legend = FALSE) +
  132.   scale_size_continuous(range = c(3, 8)) +
  133.   scale_fill_gradient("Total", breaks = seq(50, 250, by = 50),
  134.                       low = "white", high = colors["GABF_blue"], limits = c(0, 282)) +
  135.   guides(fill = guide_colourbar(barheight = unit(3.5, "in"), # https://github.com/tidyverse/ggplot2/pull/2541
  136.          ticks.colour = "black", frame.colour = "black")) +
  137.   labs(title = expression(paste(bold("Great American Beer Festival"), " - Cumulative distribution of gold medal winning breweries from 1987 to 2020")),
  138.        caption = expression(paste("Created by: ", italic("reddit.com/u/brianhaas19"), "\tData source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-10-20/readme.md")))) +
  139.   coord_quickmap() +
  140.   map_theme
  141. g
  142.  
  143. ### ggsave(str_c(getwd(), "/GABF_cum_total.png"), plot = g, width = 12, height = 6) # uncomment to save to disk
  144.  
  145.  
  146. ### Animated plot with annual medals awarede and cumulative medal totals:
  147. num_years <- n_distinct(gold_medal_map$year)
  148. p <- ggplot() +
  149.   geom_polygon(data = states_cum_medals,
  150.                aes(x = long, y = lat, group = group, fill = cum_medal),
  151.                size = 0.2,
  152.                color = colors["GABF_blue"]) +
  153.   geom_point(data = gold_medal_map,
  154.              aes(long, lat, size = n),
  155.              color = colors["GABF_gold"],
  156.              alpha = 0.7,
  157.              show.legend = FALSE) +
  158.   ggrepel::geom_label_repel(data = filter(gold_medal_map, n > 1),
  159.                             label.size = 0,
  160.                             fill = alpha(c("white"), 0.6),
  161.                             aes(long, lat, label = label),
  162.                             color = colors["GABF_blue"],
  163.                             seed = 1) +
  164.   scale_size_continuous(range = c(3, 8)) +
  165.   scale_fill_gradient("Total", breaks = seq(50, 250, by = 50),
  166.                       low = "white", high = colors["GABF_blue"], limits = c(0, 282)) +
  167.   guides(fill = guide_colourbar(barheight = unit(3.5, "in"), # https://github.com/tidyverse/ggplot2/pull/2541
  168.          ticks.colour = "black", frame.colour = "black")) +
  169.   coord_quickmap() +
  170.   map_theme +
  171.   labs(title = expression(paste(bold("Great American Beer Festival"), " - Cumulative distribution of gold medal winning breweries from 1987 to 2020")),
  172.        subtitle = "Year: {next_state}",
  173.        caption = expression(paste("Created by: ", italic("reddit.com/user/brianhaas19"), "\tData source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-10-20/readme.md")))) +
  174.   transition_states(year, transition_length = 1, state_length = 1)
  175.  
  176. animate(p,
  177.         nframes = num_years * 2, duration = num_years*2,
  178.         height = 6, width = 12, units = "in", res = 150)
  179.  
  180. #anim_save(str_c(getwd(), "/GABF_cum_label.gif"), plot = p) # uncomment to save to disk
  181. ###################################################################################################
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×