Advertisement
Guest User

Untitled

a guest
Jun 6th, 2024
190
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 23.85 KB | None | 0 0
  1. ### --- --- --- --- --- --- --- --- --- --- ---
  2. #### load libraries ####
  3. ### --- --- --- --- --- --- --- --- --- --- ---
  4.  
  5. library(httr)
  6. library(jsonlite)
  7. library(rvest)
  8. library(glmnet)
  9.  
  10. ### --- --- --- --- --- --- --- --- --- --- ---
  11. #### pull shows, songs, setlists data from API ####
  12. ### --- --- --- --- --- --- --- --- --- --- ---
  13.  
  14. # Set API Key and base URL
  15. api_key <- "XXXXXXXXX"
  16. base_url <- "https://api.phish.net/v5"
  17.  
  18. ## Shows ##
  19.  
  20. url <- 'https://api.phish.net/v5/shows/artist/phish.json?order_by=showdate&apikey=XXXXXXXXX'
  21. response <- GET(url)
  22. shows_data <- fromJSON(content(response, "text"), flatten = TRUE)
  23. shows_data <- shows_data$data
  24.  
  25. ## Songs ##
  26.  
  27. url <- 'https://api.phish.net/v5/songs.json?apikey=XXXXXXXXX'
  28. response <- GET(url)
  29. songs_data <- fromJSON(content(response, "text"), flatten = TRUE)
  30. songs_data <- songs_data$data
  31.  
  32. ## Setlists ##
  33.  
  34. #url <- 'https://api.phish.net/v5/setlists/showdate/1997-11-22.json?apikey=XXXXXXXXX'
  35. url <- 'https://api.phish.net/v5/setlists.json?apikey=XXXXXXXXX'
  36. response <- GET(url)
  37. setlists_data <- fromJSON(content(response, "text"), flatten = TRUE)
  38. setlists_data <- setlists_data$data
  39.  
  40. ### --- --- --- --- --- --- --- --- --- --- ---
  41. #### scrape ratings data from web ####
  42. ### --- --- --- --- --- --- --- --- --- --- ---
  43.  
  44. ratings_df <- data.frame()
  45.  
  46. for(i in 1984:2023) {
  47.  
  48.   url <- paste0("https://phish.net/music/ratings/", i)
  49.   page <- read_html(url)
  50.  
  51.   ratings <- page %>%
  52.     html_nodes('td') %>%
  53.     html_text()
  54.  
  55.   ratings <- matrix(ratings, ncol = 6, byrow = TRUE)
  56.   ratings <- as.data.frame(ratings, stringsAsFactors = FALSE)
  57.  
  58.   show_links <- page %>%
  59.     html_nodes(".sortable a") %>%
  60.     html_attr("href")
  61.  
  62.   ratings <- bind_cols(ratings, show_links)
  63.  
  64.   colnames(ratings) <- c('rating', 'show_date', 'venue', 'city', 'state', 'country', 'link')
  65.  
  66.   ratings_df <- bind_rows(ratings_df, ratings)
  67.  
  68. }
  69.  
  70. ### --- --- --- --- --- --- --- --- --- --- ---
  71. #### join ratings and setlist data ####
  72. ### --- --- --- --- --- --- --- --- --- --- ---
  73.  
  74. combined_df <- ratings_df %>%
  75.   mutate(link = paste0('https://phish.net', link)) %>%
  76.  
  77.   inner_join(y = shows_data %>%
  78.                filter(exclude_from_stats == 0, artistid == 1) %>%
  79.                select(showid, permalink),
  80.              by = c('link' = 'permalink')) %>%
  81.  
  82.   inner_join(y = setlists_data %>%
  83.                filter(artistid == 1) %>%
  84.                select(showid, song),
  85.              by = 'showid') %>%
  86.  
  87.   mutate(
  88.     rating = as.numeric(rating),
  89.     yr = lubridate::year(show_date))
  90.  
  91. ## identify songs with at least 5 appearances ##
  92.  
  93. eligible_songs <- combined_df %>%
  94.   group_by(song) %>%
  95.   summarise(
  96.     n_shows = n_distinct(showid),
  97.     avg_rating = mean(rating, na.rm = TRUE))
  98. #  filter(n_shows >= 25)
  99.  
  100. #combined_df <- combined_df %>%
  101. #  filter(song %in% eligible_songs$song)
  102.  
  103. ### --- --- --- --- --- --- --- --- --- --- ---
  104. #### fit vanilla model ####
  105. ### --- --- --- --- --- --- --- --- --- --- ---
  106.  
  107. ## spread ##
  108.  
  109. combined_df_spread_vanilla <- combined_df %>%
  110.   select(showid, show_date, song, rating) %>%
  111.   distinct() %>%
  112.   mutate(val = 1) %>%
  113.   spread(key = song, value = val, fill = 0) %>%
  114.   dplyr::select(-showid, -show_date)
  115.  
  116. ## fit model ##
  117.  
  118. x_vanilla <- as.matrix(combined_df_spread_vanilla[, -1])
  119. y_vanilla <- combined_df_spread_vanilla$rating
  120.  
  121. lambda_values_vanilla <- 10^seq(3, -2, length = 500)
  122. ridge_model_vanilla <- glmnet(x_vanilla, y_vanilla, alpha = 0, lambda = lambda_values_vanilla, standardize = FALSE)
  123. cv_ridge_vanilla <- cv.glmnet(x_vanilla, y_vanilla, alpha = 0, lambda = lambda_values_vanilla, standardize = FALSE, nfolds = 250)
  124. best_lambda_vanilla <- cv_ridge_vanilla$lambda.min
  125. coefficients_vanilla <- coef(ridge_model_vanilla, s = best_lambda_vanilla)
  126.  
  127. coef_vector_vanilla <- as.vector(coefficients_vanilla)
  128. feature_names_vanilla <- c("Intercept", colnames(combined_df_spread_vanilla)[-1])
  129.  
  130. coef_df_vanilla <- data.frame(
  131.   Feature = feature_names_vanilla,
  132.   Coefficient = coef_vector_vanilla,
  133.   stringsAsFactors = FALSE
  134. )
  135.  
  136. coef_df_vanilla <- coef_df_vanilla %>% left_join(eligible_songs, by = c('Feature' = 'song'))
  137.  
  138. View(coef_df_vanilla)
  139.  
  140. ### --- --- --- --- --- --- --- --- --- --- ---
  141. #### [plot] top/bottom songs: vanilla model ####
  142. ### --- --- --- --- --- --- --- --- --- --- ---
  143.  
  144. top_20_songs_vanilla_plot <- coef_df_vanilla %>%
  145.   filter(Feature != 'Intercept') %>%
  146.   top_n(20, Coefficient) %>%
  147.   ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  148.   coord_flip() +
  149.   geom_col(fill = 'dodgerblue') +
  150.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.005,
  151.             size = 2.75, fontface = 'bold', color = 'dodgerblue') +
  152.   theme_rs2 +
  153.   labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 20 Most Impactful Songs | Vanilla Model')
  154.  
  155. ggsave('top_20_songs_vanilla_plot.png', top_20_songs_vanilla_plot, dpi = 700, height = 4.5, width = 8)
  156.  
  157. bottom_20_songs_vanilla_plot <- coef_df_vanilla %>%
  158.   filter(Feature != 'Intercept') %>%
  159.   top_n(20, desc(Coefficient)) %>%
  160.   ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
  161.   coord_flip() +
  162.   geom_col(fill = 'red') +
  163.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0035,
  164.             size = 2.75, fontface = 'bold', color = 'red') +
  165.   theme_rs2 +
  166.   labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Bottom 20 Most Impactful Songs | Vanilla Model')
  167.  
  168. ggsave('bottom_20_songs_vanilla_plot.png', bottom_20_songs_vanilla_plot, dpi = 700, height = 4.5, width = 8)
  169.  
  170. ### --- --- --- --- --- --- --- --- --- --- ---
  171. #### fit year-adjusted model ####
  172. ### --- --- --- --- --- --- --- --- --- --- ---
  173.  
  174. ## spread ##
  175.  
  176. combined_df_spread_yr_adj <- combined_df %>%
  177.   select(showid, show_date, song, rating, yr) %>%
  178.   distinct() %>%
  179.   mutate(val = 1) %>%
  180.   spread(key = song, value = val, fill = 0) %>%
  181.   mutate(val = 1) %>%
  182.   spread(key = yr, value = val, fill = 0) %>%
  183.   dplyr::select(-showid, -show_date)
  184.  
  185. ## fit model ##
  186.  
  187. x_yr_adj <- as.matrix(combined_df_spread_yr_adj[, -1])
  188. y_yr_adj <- combined_df_spread_yr_adj$rating
  189.  
  190. lambda_values_yr_adj <- 10^seq(3, -2, length = 500)
  191. ridge_model_yr_adj <- glmnet(x_yr_adj, y_yr_adj, alpha = 0, lambda = lambda_values_yr_adj, standardize = FALSE)
  192. cv_ridge_yr_adj <- cv.glmnet(x_yr_adj, y_yr_adj, alpha = 0, lambda = lambda_values_yr_adj, standardize = FALSE, nfolds = 250)
  193. best_lambda_yr_adj <- cv_ridge_yr_adj$lambda.min
  194. coefficients_yr_adj <- coef(ridge_model_yr_adj, s = best_lambda_yr_adj)
  195.  
  196. coef_vector_yr_adj <- as.vector(coefficients_yr_adj)
  197. feature_names_yr_adj <- c("Intercept", colnames(combined_df_spread_yr_adj)[-1])
  198.  
  199. coef_df_yr_adj <- data.frame(
  200.   Feature = feature_names_yr_adj,
  201.   Coefficient = coef_vector_yr_adj,
  202.   stringsAsFactors = FALSE
  203. )
  204.  
  205. coef_df_yr_adj <- coef_df_yr_adj %>% left_join(eligible_songs, by = c('Feature' = 'song'))
  206.  
  207. View(coef_df_yr_adj)
  208.  
  209. ### --- --- --- --- --- --- --- --- --- --- ---
  210. #### [plot] top/bottom songs: year-adjusted model ####
  211. ### --- --- --- --- --- --- --- --- --- --- ---
  212.  
  213. top_20_songs_yr_adj_plot <- coef_df_yr_adj %>%
  214.   filter(Feature != 'Intercept', n_shows > 0) %>%
  215.   top_n(20, Coefficient) %>%
  216.   ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  217.   coord_flip() +
  218.   geom_col(fill = 'dodgerblue') +
  219.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.005,
  220.             size = 2.75, fontface = 'bold', color = 'dodgerblue') +
  221.   theme_rs2 +
  222.   labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 20 Most Impactful Songs | Year-Adjusted Model')
  223.  
  224. ggsave('top_20_songs_yr_adj_plot.png', top_20_songs_yr_adj_plot, dpi = 700, height = 4.5, width = 8)
  225.  
  226.  
  227. bottom_20_songs_yr_adj_plot <- coef_df_yr_adj %>%
  228.   filter(Feature != 'Intercept', n_shows > 0) %>%
  229.   top_n(20, desc(Coefficient)) %>%
  230.   ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
  231.   coord_flip() +
  232.   geom_col(fill = 'red') +
  233.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0035,
  234.             size = 2.75, fontface = 'bold', color = 'red') +
  235.   theme_rs2 +
  236.   labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Bottom 20 Most Impactful Songs | Year-Adjusted Model')
  237.  
  238. ggsave('bottom_20_songs_yr_adj_plot.png', bottom_20_songs_yr_adj_plot, dpi = 700, height = 4.5, width = 8)
  239.  
  240.  
  241. ### --- --- --- --- --- --- --- --- --- --- ---
  242. #### [plot] year contributions: year-adjusted model ####
  243. ### --- --- --- --- --- --- --- --- --- --- ---
  244.  
  245. yr_adj_plot_df <- coef_df_yr_adj %>%
  246.   filter(Feature != 'Intercept', is.na(n_shows)) %>%
  247.   mutate(Feature = as.numeric(Feature)) %>%
  248.   filter(Feature >= 1983, Feature <= 2023)
  249.  
  250. yr_adj_plot <- yr_adj_plot_df %>%
  251.   ggplot(aes(x = Feature, y = Coefficient)) +
  252.   geom_col(fill = '#1DB954') +
  253.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01), vjust = ifelse(Coefficient >= 0, -0.5, 1.5)),
  254.             size = 2.05, fontface = 'bold', color = '#1DB954') +
  255.   theme_rs3 +
  256.   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  257.   scale_x_continuous(breaks = seq(1984, 2023, 1)) +
  258.   labs(title = 'Phish Setlist Rating Plus-Minus: Years', subtitle = 'Year-Adjusted Model', x = 'Show Year', y = 'Year RAPM')
  259.  
  260. ggsave('yr_adj_plot.png', yr_adj_plot, dpi = 700, height = 4.5, width = 8.5)
  261.  
  262.  
  263. ### --- --- --- --- --- --- --- --- --- --- ---
  264. #### fit tour-adjusted model ####
  265. ### --- --- --- --- --- --- --- --- --- --- ---
  266.  
  267. ## eligible tours ##
  268.  
  269. eligible_tours <- shows_data %>%
  270.   dplyr::select(showid, tour_name) %>%
  271.   group_by(tour_name) %>%
  272.   summarise(n_shows = n_distinct(showid)) %>%
  273.   filter(n_shows >= 6)
  274.  
  275. ## spread ##
  276.  
  277. combined_df_spread_tour_adj <- combined_df %>%
  278.   select(showid, show_date, song, rating) %>%
  279.   left_join(
  280.     y = shows_data %>% dplyr::select(showid, tour_name),
  281.     by = 'showid'
  282.   ) %>%
  283.   distinct() %>%
  284.   filter(tour_name %in% eligible_tours$tour_name) %>%
  285.   mutate(val = 1) %>%
  286.   spread(key = song, value = val, fill = 0) %>%
  287.   mutate(val = 1) %>%
  288.   spread(key = tour_name, value = val, fill = 0) %>%
  289.   dplyr::select(-showid, -show_date)
  290.  
  291. ## fit model ##
  292.  
  293. x_tour_adj <- as.matrix(combined_df_spread_tour_adj[, -1])
  294. y_tour_adj <- combined_df_spread_tour_adj$rating
  295.  
  296. lambda_values_tour_adj <- 10^seq(3, -2, length = 500)
  297. ridge_model_tour_adj <- glmnet(x_tour_adj, y_tour_adj, alpha = 0, lambda = lambda_values_tour_adj, standardize = FALSE)
  298. cv_ridge_tour_adj <- cv.glmnet(x_tour_adj, y_tour_adj, alpha = 0, lambda = lambda_values_tour_adj, standardize = FALSE, nfolds = 250)
  299. best_lambda_tour_adj <- cv_ridge_tour_adj$lambda.min
  300. coefficients_tour_adj <- coef(ridge_model_tour_adj, s = best_lambda_tour_adj)
  301.  
  302. coef_vector_tour_adj <- as.vector(coefficients_tour_adj)
  303. feature_names_tour_adj <- c("Intercept", colnames(combined_df_spread_tour_adj)[-1])
  304.  
  305. coef_df_tour_adj <- data.frame(
  306.   Feature = feature_names_tour_adj,
  307.   Coefficient = coef_vector_tour_adj,
  308.   stringsAsFactors = FALSE
  309. )
  310.  
  311. coef_df_tour_adj <- coef_df_tour_adj %>% left_join(eligible_songs, by = c('Feature' = 'song'))
  312.  
  313. View(coef_df_tour_adj)
  314.  
  315. ### --- --- --- --- --- --- --- --- --- --- ---
  316. #### [plot] top/bottom tours: tour-adjusted model ####
  317. ### --- --- --- --- --- --- --- --- --- --- ---
  318.  
  319. top_20_tours_tour_adj_plot <- coef_df_tour_adj %>%
  320.   filter(Feature != 'Intercept', Feature %in% eligible_tours$tour_name) %>%
  321.   top_n(20, Coefficient) %>%
  322.   ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  323.   coord_flip() +
  324.   geom_col(fill = 'dodgerblue') +
  325.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.0075,
  326.             size = 2.75, fontface = 'bold', color = 'dodgerblue') +
  327.   theme_rs2 +
  328.   labs(y = 'Tour RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Tours', subtitle = 'Top 20 Most Impactful Tours | Tour-Adjusted Model')
  329.  
  330. ggsave('top_20_tours_tour_adj_plot.png', top_20_tours_tour_adj_plot, dpi = 700, height = 4.5, width = 8.5)
  331.  
  332.  
  333. bottom_20_tours_tour_adj_plot <- coef_df_tour_adj %>%
  334.   filter(Feature != 'Intercept', Feature %in% eligible_tours$tour_name) %>%
  335.   top_n(20, desc(Coefficient)) %>%
  336.   ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
  337.   coord_flip() +
  338.   geom_col(fill = 'red') +
  339.   geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0095,
  340.             size = 2.75, fontface = 'bold', color = 'red') +
  341.   theme_rs2 +
  342.   labs(y = 'Tour RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Tours', subtitle = 'Bottom 20 Most Impactful Tours | Tour-Adjusted Model')
  343.  
  344. ggsave('bottom_20_tours_tour_adj_plot.png', bottom_20_tours_tour_adj_plot, dpi = 700, height = 4.5, width = 8.5)
  345.  
  346.  
  347. ### --- --- --- --- --- --- --- --- --- --- ---
  348. #### [plot] ratings for most played songs: tour-adjusted model ####
  349. ### --- --- --- --- --- --- --- --- --- --- ---
  350.  
  351. most_played_tour_adj_plot <- coef_df_tour_adj %>%
  352.   filter(Feature != 'Intercept') %>%
  353.   top_n(40, n_shows) %>%
  354.   dplyr::select(-avg_rating) %>%
  355.   gather(metric, value, -Feature) %>%
  356.   mutate(metric = ifelse(metric == 'Coefficient', 'Plus-Minus', 'Number of Shows')) %>%
  357.   ggplot(aes(x = reorder(Feature, value, max), y = value, fill = metric)) +
  358.   facet_wrap(~metric, scales = 'free_x') +
  359.   coord_flip() +
  360.   geom_col() +
  361.   geom_text(aes(
  362.     color = metric,
  363.     label = ifelse(value > 1, scales::comma(value, accuracy = 1), scales::comma(value, accuracy = .01)),
  364.     hjust = ifelse(value > 0, -0.1, 1.1)
  365.     ),
  366.     size = 2.75, fontface = 'bold') +
  367.   scale_y_continuous(expand = expansion(mult = c(.1, .1))) +
  368.   scale_color_manual(values = c('#ff8d1e', 'dodgerblue')) +
  369.   scale_fill_manual(values = c('#ff8d1e', 'dodgerblue')) +
  370.   theme_rs3 +
  371.   theme(legend.position = 'none') +
  372.   labs(y = '', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 40 Most Played Songs | Tour-Adjusted Model')
  373.  
  374. ggsave('most_played_tour_adj_plot.png', most_played_tour_adj_plot, dpi = 700, height = 7.5, width = 8)
  375.  
  376.  
  377. ### --- --- --- --- --- --- --- --- --- --- ---
  378. #### comparing models ####
  379. ### --- --- --- --- --- --- --- --- --- --- ---
  380.  
  381. coef_df_vanilla %>%
  382.   dplyr::select(Feature, n_shows, vanilla = Coefficient) %>%
  383.   inner_join(y = coef_df_yr_adj %>% dplyr::select(Feature, yr_adjusted = Coefficient), by = 'Feature') %>%
  384.   mutate(abs_diff = abs(vanilla-yr_adjusted))
  385.  
  386. combined_df %>% filter(song == 'Fluffhead') %>% group_by(yr) %>% summarise(n_shows = n_distinct(showid), avg_rating = mean(rating, na.rm = TRUE))
  387.  
  388. ### --- --- --- --- --- --- --- --- --- --- ---
  389. #### comparing to average rating ####
  390. ### --- --- --- --- --- --- --- --- --- --- ---
  391.  
  392. coef_df_tour_adj %>%
  393.   filter(n_shows > 0) %>%
  394.   mutate(pctile_coef = percent_rank(Coefficient), pctile_avg_rating = percent_rank(avg_rating)) %>%
  395.   mutate(pctile_diff = abs(pctile_coef - pctile_avg_rating))
  396.  
  397. cor(x = coef_df_tour_adj$Coefficient, y = coef_df_tour_adj$avg_rating, use = 'complete.obs')^2
  398.  
  399. avg_rating_vs_plus_minus_plot <- coef_df_tour_adj %>%
  400.   filter(Feature != 'Intercept', n_shows > 25) %>%
  401.   mutate(lab = ifelse(avg_rating < 3.6 | avg_rating > 4.1 | Coefficient > 0.08 | Coefficient < -0.08, Feature, NA)) %>%
  402.   ggplot(aes(x = avg_rating, y = Coefficient)) +
  403.   geom_point(color = 'dodgerblue') +
  404.   geom_text_repel(aes(label = lab), size = 3) +
  405.   theme_rs3 +
  406.   scale_y_continuous(limits = c(-0.2, 0.2)) +
  407.   labs(x = 'Average Rating For Shows Where This Song Is Played', y = 'Plus-Minus Contribution',
  408.        caption = 'Songs with 25+ Appearances',
  409.        title = 'Average Rating vs. Plus-Minus Contribution')
  410.  
  411. ggsave('avg_rating_vs_plus_minus_plot.png', avg_rating_vs_plus_minus_plot, dpi = 700, height = 6, width = 7)
  412.  
  413. fluffhead_yrs <- combined_df %>%
  414.   select(showid, show_date, song, rating, yr) %>%
  415.   distinct() %>%
  416.   mutate(val = 1) %>%
  417.   spread(key = song, value = val, fill = 0) %>%
  418.   dplyr::select(-showid, -show_date) %>%
  419.   mutate(fluffhead_played = ifelse(Fluffhead == 1, 'Yes', 'No')) %>%
  420.   group_by(yr, fluffhead_played) %>%
  421.   summarise(avg_rating = mean(rating, na.rm = TRUE), n_shows = n()) %>%
  422.   dplyr::select(yr, fluffhead_played, n_shows) %>%
  423.   spread(key = fluffhead_played, value = n_shows) %>%
  424.   filter(No > 3, Yes > 3)
  425.  
  426.  
  427. fluffhead_plot <- combined_df %>%
  428.   select(showid, show_date, song, rating, yr) %>%
  429.   distinct() %>%
  430.   mutate(val = 1) %>%
  431.   spread(key = song, value = val, fill = 0) %>%
  432.   dplyr::select(-showid, -show_date) %>%
  433.   mutate(fluffhead_played = ifelse(Fluffhead == 1, 'Yes', 'No')) %>%
  434.   group_by(yr, fluffhead_played) %>%
  435.   summarise(avg_rating = mean(rating, na.rm = TRUE)) %>%
  436.   filter(yr %in% fluffhead_yrs$yr) %>%
  437.   ggplot(aes(x = as.factor(yr), y = avg_rating, color = as.factor(fluffhead_played))) +
  438.   coord_flip() +
  439.   geom_col(position = 'dodge', width = 0.8) +
  440.   scale_fill_manual(values = c('red', 'dodgerblue'), name = 'Was Fluffhead Played?') +
  441.   theme_rs2 +
  442.   theme(legend.position = 'bottom') +
  443.   labs(x = '', y = 'Average Rating',
  444.        subtitle = 'Years with 3+ Fluffhead Appearances and Non-Appearances',
  445.        title = 'Average Show Rating by Year: With or Without Fluffhead')
  446.  
  447. ggsave('fluffhead_plot.png', fluffhead_plot, dpi = 700, height = 6, width = 7.5)
  448.  
  449.  
  450.  
  451. ### --- --- --- --- --- --- --- --- --- --- ---
  452. #### test predictiveness (vanilla) ####
  453. ### --- --- --- --- --- --- --- --- --- --- ---
  454.  
  455. ## spread ##
  456.  
  457. combined_df_spread_vanilla <- combined_df %>%
  458.   select(showid, show_date, song, rating) %>%
  459.   distinct() %>%
  460.   mutate(val = 1) %>%
  461.   spread(key = song, value = val, fill = 0) %>%
  462.   dplyr::select(-showid, -show_date)
  463.  
  464. ## fit model ##
  465.  
  466. index <- createDataPartition(combined_df_spread_vanilla$rating, p = 0.8, list = FALSE)
  467. train_data <- combined_df_spread_vanilla[index, ]
  468. test_data <- combined_df_spread_vanilla[-index, ]
  469.  
  470. x_train <- as.matrix(train_data[, -1])
  471. y_train <- train_data$rating
  472. x_test <- as.matrix(test_data[, -1])
  473. y_test <- test_data$rating
  474.  
  475. lambda_values_vanilla_oos <- 10^seq(3, -2, length = 500)
  476. ridge_model_vanilla_oos <- glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_vanilla_oos, standardize = FALSE)
  477. cv_ridge_vanilla_oos <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_vanilla_oos, standardize = FALSE, nfolds = 250)
  478. best_lambda_vanilla_oos <- cv_ridge_vanilla_oos$lambda.min
  479. coefficients_vanilla_oos <- coef(ridge_model_vanilla_oos, s = best_lambda_vanilla_oos)
  480.  
  481. coef_vector_vanilla_oos <- as.vector(coefficients_vanilla_oos)
  482. feature_names_vanilla_oos <- c("Intercept", colnames(train_data)[-1])
  483.  
  484. coef_df_vanilla_oos <- data.frame(
  485.   Feature = feature_names_vanilla_oos,
  486.   Coefficient = coef_vector_vanilla_oos,
  487.   stringsAsFactors = FALSE
  488. )
  489.  
  490. coef_df_vanilla_oos <- coef_df_vanilla_oos %>% left_join(eligible_songs, by = c('Feature' = 'song'))
  491.  
  492. y_pred <- predict(ridge_model_vanilla_oos, s = best_lambda_vanilla_oos, newx = x_test)
  493.  
  494. # Calculate RMSE
  495. rmse <- sqrt(mean((y_test - y_pred)^2))
  496. cat("Root Mean Squared Error (RMSE) on Test Set: ", rmse, "\n")
  497.  
  498. # Calculate R-squared
  499. rss <- sum((y_test - y_pred)^2)
  500. tss <- sum((y_test - mean(y_test))^2)
  501. r_squared <- 1 - (rss / tss)
  502. cat("R-squared on Test Set: ", r_squared, "\n")
  503.  
  504. # Create a dataframe for actual vs predicted values
  505. pred_vs_actual_df <- data.frame(
  506.   Actual = y_test,
  507.   Predicted = as.vector(y_pred)
  508. )
  509.  
  510. # Plot predicted vs actual values
  511. vanilla_pred_vs_actual_plot <- ggplot(pred_vs_actual_df, aes(x = Predicted, y = Actual)) +
  512.   geom_point(color = 'dodgerblue', size = 2.5, alpha = 0.9) +
  513.   geom_abline(intercept = 0, slope = 1, color = 'grey', linetype = 'dashed') +
  514.   labs(
  515.     title = "Predicted vs Actual Ratings",
  516.     subtitle = 'Vanilla Model | Out-of-Sample Prediction | 1 point = 1 show',
  517.     x = "Predicted Rating",
  518.     y = "Actual Rating"
  519.   ) +
  520.   theme_rs3 +
  521.   scale_x_continuous(limits = c(1, 5)) +
  522.   scale_y_continuous(limits = c(1, 5))
  523.  
  524. ggsave('vanilla_pred_vs_actual_plot.png', vanilla_pred_vs_actual_plot, dpi = 700, height = 6, width = 7.5)
  525.  
  526. ### --- --- --- --- --- --- --- --- --- --- ---
  527. #### test predictiveness (tour-adjusted) ####
  528. ### --- --- --- --- --- --- --- --- --- --- ---
  529.  
  530. ## eligible tours ##
  531.  
  532. eligible_tours <- shows_data %>%
  533.   dplyr::select(showid, tour_name) %>%
  534.   group_by(tour_name) %>%
  535.   summarise(n_shows = n_distinct(showid)) %>%
  536.   filter(n_shows >= 6)
  537.  
  538. ## spread ##
  539.  
  540. combined_df_spread_tour_adj <- combined_df %>%
  541.   select(showid, show_date, song, rating) %>%
  542.   left_join(
  543.     y = shows_data %>% dplyr::select(showid, tour_name),
  544.     by = 'showid'
  545.   ) %>%
  546.   distinct() %>%
  547.   filter(tour_name %in% eligible_tours$tour_name) %>%
  548.   mutate(val = 1) %>%
  549.   spread(key = song, value = val, fill = 0) %>%
  550.   mutate(val = 1) %>%
  551.   spread(key = tour_name, value = val, fill = 0) %>%
  552.   dplyr::select(-showid, -show_date)
  553.  
  554. ## fit model ##
  555.  
  556. index <- createDataPartition(combined_df_spread_tour_adj$rating, p = 0.8, list = FALSE)
  557. train_data <- combined_df_spread_tour_adj[index, ]
  558. test_data <- combined_df_spread_tour_adj[-index, ]
  559.  
  560. x_train <- as.matrix(train_data[, -1])
  561. y_train <- train_data$rating
  562. x_test <- as.matrix(test_data[, -1])
  563. y_test <- test_data$rating
  564.  
  565. lambda_values_tour_adj_oos <- 10^seq(3, -2, length = 500)
  566. ridge_model_tour_adj_oos <- glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_tour_adj_oos, standardize = FALSE)
  567. cv_ridge_tour_adj_oos <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_tour_adj_oos, standardize = FALSE, nfolds = 250)
  568. best_lambda_tour_adj_oos <- cv_ridge_tour_adj_oos$lambda.min
  569. coefficients_tour_adj_oos <- coef(ridge_model_tour_adj_oos, s = best_lambda_tour_adj_oos)
  570.  
  571. coef_vector_tour_adj_oos <- as.vector(coefficients_tour_adj_oos)
  572. feature_names_tour_adj_oos <- c("Intercept", colnames(train_data)[-1])
  573.  
  574. coef_df_tour_adj_oos <- data.frame(
  575.   Feature = feature_names_tour_adj_oos,
  576.   Coefficient = coef_vector_tour_adj_oos,
  577.   stringsAsFactors = FALSE
  578. )
  579.  
  580. coef_df_tour_adj_oos <- coef_df_tour_adj_oos %>% left_join(eligible_songs, by = c('Feature' = 'song'))
  581.  
  582. y_pred <- predict(ridge_model_tour_adj_oos, s = best_lambda_tour_adj_oos, newx = x_test)
  583.  
  584. # Calculate RMSE
  585. rmse <- sqrt(mean((y_test - y_pred)^2))
  586. cat("Root Mean Squared Error (RMSE) on Test Set: ", rmse, "\n")
  587.  
  588. # Calculate R-squared
  589. rss <- sum((y_test - y_pred)^2)
  590. tss <- sum((y_test - mean(y_test))^2)
  591. r_squared <- 1 - (rss / tss)
  592. cat("R-squared on Test Set: ", r_squared, "\n")
  593.  
  594. # Create a dataframe for actual vs predicted values
  595. pred_vs_actual_df <- data.frame(
  596.   Actual = y_test,
  597.   Predicted = as.vector(y_pred)
  598. )
  599.  
  600. # Plot predicted vs actual values
  601. tour_adj_pred_vs_actual_plot <- ggplot(pred_vs_actual_df, aes(x = Predicted, y = Actual)) +
  602.   geom_point(color = 'dodgerblue', size = 2.5, alpha = 0.9) +
  603.   geom_abline(intercept = 0, slope = 1, color = 'grey', linetype = 'dashed') +
  604.   labs(
  605.     title = "Predicted vs Actual Ratings",
  606.     subtitle = 'Tour-Adjusted Model | Out-of-Sample Prediction | 1 point = 1 show',
  607.     x = "Predicted Rating",
  608.     y = "Actual Rating"
  609.   ) +
  610.   theme_rs3 +
  611.   scale_x_continuous(limits = c(1, 5)) +
  612.   scale_y_continuous(limits = c(1, 5))
  613.  
  614. ggsave('tour_adj_pred_vs_actual_plot.png', tour_adj_pred_vs_actual_plot, dpi = 700, height = 6, width = 7.5)
  615.  
  616.  
  617. ### --- --- --- --- --- --- --- --- --- --- ---
  618. #### combine all 3 coefficient flavors ####
  619. ### --- --- --- --- --- --- --- --- --- --- ---
  620.  
  621. clipr::write_clip(
  622.   coef_df_vanilla %>%
  623.     filter(Feature != 'Intercept') %>%
  624.     dplyr::select(Feature, n_shows, vanilla_plus_minus = Coefficient) %>%
  625.     left_join(y = coef_df_yr_adj %>%
  626.                 filter(Feature != 'Intercept', n_shows > 0) %>%
  627.                 dplyr::select(Feature, yr_adj_plus_minus = Coefficient), by = 'Feature') %>%
  628.     left_join(y = coef_df_tour_adj %>%
  629.                 filter(Feature != 'Intercept', n_shows > 0) %>%
  630.                 dplyr::select(Feature, tour_adj_plus_minus = Coefficient), by = 'Feature')
  631. )
  632.  
  633.  
  634.  
  635.  
  636.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement