Untitled

### --- --- --- --- --- --- --- --- --- --- ---
#### load libraries ####
### --- --- --- --- --- --- --- --- --- --- ---

library(httr)
library(jsonlite)
library(rvest)
library(glmnet)

### --- --- --- --- --- --- --- --- --- --- ---
#### pull shows, songs, setlists data from API ####
### --- --- --- --- --- --- --- --- --- --- ---

# Set API Key and base URL
api_key <- "XXXXXXXXX"
base_url <- "https://api.phish.net/v5"

## Shows ##

url <- 'https://api.phish.net/v5/shows/artist/phish.json?order_by=showdate&apikey=XXXXXXXXX'
response <- GET(url)
shows_data <- fromJSON(content(response, "text"), flatten = TRUE)
shows_data <- shows_data$data

## Songs ##

url <- 'https://api.phish.net/v5/songs.json?apikey=XXXXXXXXX'
response <- GET(url)
songs_data <- fromJSON(content(response, "text"), flatten = TRUE)
songs_data <- songs_data$data

## Setlists ##

#url <- 'https://api.phish.net/v5/setlists/showdate/1997-11-22.json?apikey=XXXXXXXXX'
url <- 'https://api.phish.net/v5/setlists.json?apikey=XXXXXXXXX'
response <- GET(url)
setlists_data <- fromJSON(content(response, "text"), flatten = TRUE)
setlists_data <- setlists_data$data

### --- --- --- --- --- --- --- --- --- --- ---
#### scrape ratings data from web ####
### --- --- --- --- --- --- --- --- --- --- ---

ratings_df <- data.frame()

for(i in 1984:2023) {

  url <- paste0("https://phish.net/music/ratings/", i)
  page <- read_html(url)

  ratings <- page %>%
    html_nodes('td') %>%
    html_text()

  ratings <- matrix(ratings, ncol = 6, byrow = TRUE)
  ratings <- as.data.frame(ratings, stringsAsFactors = FALSE)

  show_links <- page %>%
    html_nodes(".sortable a") %>%
    html_attr("href")

  ratings <- bind_cols(ratings, show_links)

  colnames(ratings) <- c('rating', 'show_date', 'venue', 'city', 'state', 'country', 'link')

  ratings_df <- bind_rows(ratings_df, ratings)

}

### --- --- --- --- --- --- --- --- --- --- ---
#### join ratings and setlist data ####
### --- --- --- --- --- --- --- --- --- --- ---

combined_df <- ratings_df %>%
  mutate(link = paste0('https://phish.net', link)) %>%

  inner_join(y = shows_data %>%
               filter(exclude_from_stats == 0, artistid == 1) %>%
               select(showid, permalink),
             by = c('link' = 'permalink')) %>%

  inner_join(y = setlists_data %>%
               filter(artistid == 1) %>%
               select(showid, song),
             by = 'showid') %>%

  mutate(
    rating = as.numeric(rating),
    yr = lubridate::year(show_date))

## identify songs with at least 5 appearances ##

eligible_songs <- combined_df %>%
  group_by(song) %>%
  summarise(
    n_shows = n_distinct(showid),
    avg_rating = mean(rating, na.rm = TRUE))
#  filter(n_shows >= 25)

#combined_df <- combined_df %>%
#  filter(song %in% eligible_songs$song)

### --- --- --- --- --- --- --- --- --- --- ---
#### fit vanilla model ####
### --- --- --- --- --- --- --- --- --- --- ---

## spread ##

combined_df_spread_vanilla <- combined_df %>%
  select(showid, show_date, song, rating) %>%
  distinct() %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date)

## fit model ##

x_vanilla <- as.matrix(combined_df_spread_vanilla[, -1])
y_vanilla <- combined_df_spread_vanilla$rating

lambda_values_vanilla <- 10^seq(3, -2, length = 500)
ridge_model_vanilla <- glmnet(x_vanilla, y_vanilla, alpha = 0, lambda = lambda_values_vanilla, standardize = FALSE)
cv_ridge_vanilla <- cv.glmnet(x_vanilla, y_vanilla, alpha = 0, lambda = lambda_values_vanilla, standardize = FALSE, nfolds = 250)
best_lambda_vanilla <- cv_ridge_vanilla$lambda.min
coefficients_vanilla <- coef(ridge_model_vanilla, s = best_lambda_vanilla)

coef_vector_vanilla <- as.vector(coefficients_vanilla)
feature_names_vanilla <- c("Intercept", colnames(combined_df_spread_vanilla)[-1])

coef_df_vanilla <- data.frame(
  Feature = feature_names_vanilla,
  Coefficient = coef_vector_vanilla,
  stringsAsFactors = FALSE
)

coef_df_vanilla <- coef_df_vanilla %>% left_join(eligible_songs, by = c('Feature' = 'song'))

View(coef_df_vanilla)

### --- --- --- --- --- --- --- --- --- --- ---
#### [plot] top/bottom songs: vanilla model ####
### --- --- --- --- --- --- --- --- --- --- ---

top_20_songs_vanilla_plot <- coef_df_vanilla %>%
  filter(Feature != 'Intercept') %>%
  top_n(20, Coefficient) %>%
  ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  coord_flip() +
  geom_col(fill = 'dodgerblue') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.005,
            size = 2.75, fontface = 'bold', color = 'dodgerblue') +
  theme_rs2 +
  labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 20 Most Impactful Songs | Vanilla Model')

ggsave('top_20_songs_vanilla_plot.png', top_20_songs_vanilla_plot, dpi = 700, height = 4.5, width = 8)

bottom_20_songs_vanilla_plot <- coef_df_vanilla %>%
  filter(Feature != 'Intercept') %>%
  top_n(20, desc(Coefficient)) %>%
  ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
  coord_flip() +
  geom_col(fill = 'red') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0035,
            size = 2.75, fontface = 'bold', color = 'red') +
  theme_rs2 +
  labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Bottom 20 Most Impactful Songs | Vanilla Model')

ggsave('bottom_20_songs_vanilla_plot.png', bottom_20_songs_vanilla_plot, dpi = 700, height = 4.5, width = 8)

### --- --- --- --- --- --- --- --- --- --- ---
#### fit year-adjusted model ####
### --- --- --- --- --- --- --- --- --- --- ---

## spread ##

combined_df_spread_yr_adj <- combined_df %>%
  select(showid, show_date, song, rating, yr) %>%
  distinct() %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  mutate(val = 1) %>%
  spread(key = yr, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date)

## fit model ##

x_yr_adj <- as.matrix(combined_df_spread_yr_adj[, -1])
y_yr_adj <- combined_df_spread_yr_adj$rating

lambda_values_yr_adj <- 10^seq(3, -2, length = 500)
ridge_model_yr_adj <- glmnet(x_yr_adj, y_yr_adj, alpha = 0, lambda = lambda_values_yr_adj, standardize = FALSE)
cv_ridge_yr_adj <- cv.glmnet(x_yr_adj, y_yr_adj, alpha = 0, lambda = lambda_values_yr_adj, standardize = FALSE, nfolds = 250)
best_lambda_yr_adj <- cv_ridge_yr_adj$lambda.min
coefficients_yr_adj <- coef(ridge_model_yr_adj, s = best_lambda_yr_adj)

coef_vector_yr_adj <- as.vector(coefficients_yr_adj)
feature_names_yr_adj <- c("Intercept", colnames(combined_df_spread_yr_adj)[-1])

coef_df_yr_adj <- data.frame(
  Feature = feature_names_yr_adj,
  Coefficient = coef_vector_yr_adj,
  stringsAsFactors = FALSE
)

coef_df_yr_adj <- coef_df_yr_adj %>% left_join(eligible_songs, by = c('Feature' = 'song'))

View(coef_df_yr_adj)

### --- --- --- --- --- --- --- --- --- --- ---
#### [plot] top/bottom songs: year-adjusted model ####
### --- --- --- --- --- --- --- --- --- --- ---

top_20_songs_yr_adj_plot <- coef_df_yr_adj %>%
  filter(Feature != 'Intercept', n_shows > 0) %>%
  top_n(20, Coefficient) %>%
  ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  coord_flip() +
  geom_col(fill = 'dodgerblue') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.005,
            size = 2.75, fontface = 'bold', color = 'dodgerblue') +
  theme_rs2 +
  labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 20 Most Impactful Songs | Year-Adjusted Model')

ggsave('top_20_songs_yr_adj_plot.png', top_20_songs_yr_adj_plot, dpi = 700, height = 4.5, width = 8)


bottom_20_songs_yr_adj_plot <- coef_df_yr_adj %>%
  filter(Feature != 'Intercept', n_shows > 0) %>%
  top_n(20, desc(Coefficient)) %>%
  ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
  coord_flip() +
  geom_col(fill = 'red') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0035,
            size = 2.75, fontface = 'bold', color = 'red') +
  theme_rs2 +
  labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Bottom 20 Most Impactful Songs | Year-Adjusted Model')

ggsave('bottom_20_songs_yr_adj_plot.png', bottom_20_songs_yr_adj_plot, dpi = 700, height = 4.5, width = 8)


### --- --- --- --- --- --- --- --- --- --- ---
#### [plot] year contributions: year-adjusted model ####
### --- --- --- --- --- --- --- --- --- --- ---

yr_adj_plot_df <- coef_df_yr_adj %>%
  filter(Feature != 'Intercept', is.na(n_shows)) %>%
  mutate(Feature = as.numeric(Feature)) %>%
  filter(Feature >= 1983, Feature <= 2023)

yr_adj_plot <- yr_adj_plot_df %>%
  ggplot(aes(x = Feature, y = Coefficient)) +
  geom_col(fill = '#1DB954') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01), vjust = ifelse(Coefficient >= 0, -0.5, 1.5)),
            size = 2.05, fontface = 'bold', color = '#1DB954') +
  theme_rs3 +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_continuous(breaks = seq(1984, 2023, 1)) +
  labs(title = 'Phish Setlist Rating Plus-Minus: Years', subtitle = 'Year-Adjusted Model', x = 'Show Year', y = 'Year RAPM')

ggsave('yr_adj_plot.png', yr_adj_plot, dpi = 700, height = 4.5, width = 8.5)


### --- --- --- --- --- --- --- --- --- --- ---
#### fit tour-adjusted model ####
### --- --- --- --- --- --- --- --- --- --- ---

## eligible tours ##

eligible_tours <- shows_data %>%
  dplyr::select(showid, tour_name) %>%
  group_by(tour_name) %>%
  summarise(n_shows = n_distinct(showid)) %>%
  filter(n_shows >= 6)

## spread ##

combined_df_spread_tour_adj <- combined_df %>%
  select(showid, show_date, song, rating) %>%
  left_join(
    y = shows_data %>% dplyr::select(showid, tour_name),
    by = 'showid'
  ) %>%
  distinct() %>%
  filter(tour_name %in% eligible_tours$tour_name) %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  mutate(val = 1) %>%
  spread(key = tour_name, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date)

## fit model ##

x_tour_adj <- as.matrix(combined_df_spread_tour_adj[, -1])
y_tour_adj <- combined_df_spread_tour_adj$rating

lambda_values_tour_adj <- 10^seq(3, -2, length = 500)
ridge_model_tour_adj <- glmnet(x_tour_adj, y_tour_adj, alpha = 0, lambda = lambda_values_tour_adj, standardize = FALSE)
cv_ridge_tour_adj <- cv.glmnet(x_tour_adj, y_tour_adj, alpha = 0, lambda = lambda_values_tour_adj, standardize = FALSE, nfolds = 250)
best_lambda_tour_adj <- cv_ridge_tour_adj$lambda.min
coefficients_tour_adj <- coef(ridge_model_tour_adj, s = best_lambda_tour_adj)

coef_vector_tour_adj <- as.vector(coefficients_tour_adj)
feature_names_tour_adj <- c("Intercept", colnames(combined_df_spread_tour_adj)[-1])

coef_df_tour_adj <- data.frame(
  Feature = feature_names_tour_adj,
  Coefficient = coef_vector_tour_adj,
  stringsAsFactors = FALSE
)

coef_df_tour_adj <- coef_df_tour_adj %>% left_join(eligible_songs, by = c('Feature' = 'song'))

View(coef_df_tour_adj)

### --- --- --- --- --- --- --- --- --- --- ---
#### [plot] top/bottom tours: tour-adjusted model ####
### --- --- --- --- --- --- --- --- --- --- ---

top_20_tours_tour_adj_plot <- coef_df_tour_adj %>%
  filter(Feature != 'Intercept', Feature %in% eligible_tours$tour_name) %>%
  top_n(20, Coefficient) %>%
  ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
  coord_flip() +
  geom_col(fill = 'dodgerblue') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.0075,
            size = 2.75, fontface = 'bold', color = 'dodgerblue') +
  theme_rs2 +
  labs(y = 'Tour RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Tours', subtitle = 'Top 20 Most Impactful Tours | Tour-Adjusted Model')

ggsave('top_20_tours_tour_adj_plot.png', top_20_tours_tour_adj_plot, dpi = 700, height = 4.5, width = 8.5)


bottom_20_tours_tour_adj_plot <- coef_df_tour_adj %>%
  filter(Feature != 'Intercept', Feature %in% eligible_tours$tour_name) %>%
  top_n(20, desc(Coefficient)) %>%
  ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
  coord_flip() +
  geom_col(fill = 'red') +
  geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0095,
            size = 2.75, fontface = 'bold', color = 'red') +
  theme_rs2 +
  labs(y = 'Tour RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Tours', subtitle = 'Bottom 20 Most Impactful Tours | Tour-Adjusted Model')

ggsave('bottom_20_tours_tour_adj_plot.png', bottom_20_tours_tour_adj_plot, dpi = 700, height = 4.5, width = 8.5)


### --- --- --- --- --- --- --- --- --- --- ---
#### [plot] ratings for most played songs: tour-adjusted model ####
### --- --- --- --- --- --- --- --- --- --- ---

most_played_tour_adj_plot <- coef_df_tour_adj %>%
  filter(Feature != 'Intercept') %>%
  top_n(40, n_shows) %>%
  dplyr::select(-avg_rating) %>%
  gather(metric, value, -Feature) %>%
  mutate(metric = ifelse(metric == 'Coefficient', 'Plus-Minus', 'Number of Shows')) %>%
  ggplot(aes(x = reorder(Feature, value, max), y = value, fill = metric)) +
  facet_wrap(~metric, scales = 'free_x') +
  coord_flip() +
  geom_col() +
  geom_text(aes(
    color = metric,
    label = ifelse(value > 1, scales::comma(value, accuracy = 1), scales::comma(value, accuracy = .01)),
    hjust = ifelse(value > 0, -0.1, 1.1)
    ),
    size = 2.75, fontface = 'bold') +
  scale_y_continuous(expand = expansion(mult = c(.1, .1))) +
  scale_color_manual(values = c('#ff8d1e', 'dodgerblue')) +
  scale_fill_manual(values = c('#ff8d1e', 'dodgerblue')) +
  theme_rs3 +
  theme(legend.position = 'none') +
  labs(y = '', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 40 Most Played Songs | Tour-Adjusted Model')

ggsave('most_played_tour_adj_plot.png', most_played_tour_adj_plot, dpi = 700, height = 7.5, width = 8)


### --- --- --- --- --- --- --- --- --- --- ---
#### comparing models ####
### --- --- --- --- --- --- --- --- --- --- ---

coef_df_vanilla %>%
  dplyr::select(Feature, n_shows, vanilla = Coefficient) %>%
  inner_join(y = coef_df_yr_adj %>% dplyr::select(Feature, yr_adjusted = Coefficient), by = 'Feature') %>%
  mutate(abs_diff = abs(vanilla-yr_adjusted))

combined_df %>% filter(song == 'Fluffhead') %>% group_by(yr) %>% summarise(n_shows = n_distinct(showid), avg_rating = mean(rating, na.rm = TRUE))

### --- --- --- --- --- --- --- --- --- --- ---
#### comparing to average rating ####
### --- --- --- --- --- --- --- --- --- --- ---

coef_df_tour_adj %>%
  filter(n_shows > 0) %>%
  mutate(pctile_coef = percent_rank(Coefficient), pctile_avg_rating = percent_rank(avg_rating)) %>%
  mutate(pctile_diff = abs(pctile_coef - pctile_avg_rating))

cor(x = coef_df_tour_adj$Coefficient, y = coef_df_tour_adj$avg_rating, use = 'complete.obs')^2

avg_rating_vs_plus_minus_plot <- coef_df_tour_adj %>%
  filter(Feature != 'Intercept', n_shows > 25) %>%
  mutate(lab = ifelse(avg_rating < 3.6 | avg_rating > 4.1 | Coefficient > 0.08 | Coefficient < -0.08, Feature, NA)) %>%
  ggplot(aes(x = avg_rating, y = Coefficient)) +
  geom_point(color = 'dodgerblue') +
  geom_text_repel(aes(label = lab), size = 3) +
  theme_rs3 +
  scale_y_continuous(limits = c(-0.2, 0.2)) +
  labs(x = 'Average Rating For Shows Where This Song Is Played', y = 'Plus-Minus Contribution',
       caption = 'Songs with 25+ Appearances',
       title = 'Average Rating vs. Plus-Minus Contribution')

ggsave('avg_rating_vs_plus_minus_plot.png', avg_rating_vs_plus_minus_plot, dpi = 700, height = 6, width = 7)

fluffhead_yrs <- combined_df %>%
  select(showid, show_date, song, rating, yr) %>%
  distinct() %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date) %>%
  mutate(fluffhead_played = ifelse(Fluffhead == 1, 'Yes', 'No')) %>%
  group_by(yr, fluffhead_played) %>%
  summarise(avg_rating = mean(rating, na.rm = TRUE), n_shows = n()) %>%
  dplyr::select(yr, fluffhead_played, n_shows) %>%
  spread(key = fluffhead_played, value = n_shows) %>%
  filter(No > 3, Yes > 3)


fluffhead_plot <- combined_df %>%
  select(showid, show_date, song, rating, yr) %>%
  distinct() %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date) %>%
  mutate(fluffhead_played = ifelse(Fluffhead == 1, 'Yes', 'No')) %>%
  group_by(yr, fluffhead_played) %>%
  summarise(avg_rating = mean(rating, na.rm = TRUE)) %>%
  filter(yr %in% fluffhead_yrs$yr) %>%
  ggplot(aes(x = as.factor(yr), y = avg_rating, color = as.factor(fluffhead_played))) +
  coord_flip() +
  geom_col(position = 'dodge', width = 0.8) +
  scale_fill_manual(values = c('red', 'dodgerblue'), name = 'Was Fluffhead Played?') +
  theme_rs2 +
  theme(legend.position = 'bottom') +
  labs(x = '', y = 'Average Rating',
       subtitle = 'Years with 3+ Fluffhead Appearances and Non-Appearances',
       title = 'Average Show Rating by Year: With or Without Fluffhead')

ggsave('fluffhead_plot.png', fluffhead_plot, dpi = 700, height = 6, width = 7.5)


### --- --- --- --- --- --- --- --- --- --- ---
#### test predictiveness (vanilla) ####
### --- --- --- --- --- --- --- --- --- --- ---

## spread ##

combined_df_spread_vanilla <- combined_df %>%
  select(showid, show_date, song, rating) %>%
  distinct() %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date)

## fit model ##

index <- createDataPartition(combined_df_spread_vanilla$rating, p = 0.8, list = FALSE)
train_data <- combined_df_spread_vanilla[index, ]
test_data <- combined_df_spread_vanilla[-index, ]

x_train <- as.matrix(train_data[, -1])
y_train <- train_data$rating
x_test <- as.matrix(test_data[, -1])
y_test <- test_data$rating

lambda_values_vanilla_oos <- 10^seq(3, -2, length = 500)
ridge_model_vanilla_oos <- glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_vanilla_oos, standardize = FALSE)
cv_ridge_vanilla_oos <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_vanilla_oos, standardize = FALSE, nfolds = 250)
best_lambda_vanilla_oos <- cv_ridge_vanilla_oos$lambda.min
coefficients_vanilla_oos <- coef(ridge_model_vanilla_oos, s = best_lambda_vanilla_oos)

coef_vector_vanilla_oos <- as.vector(coefficients_vanilla_oos)
feature_names_vanilla_oos <- c("Intercept", colnames(train_data)[-1])

coef_df_vanilla_oos <- data.frame(
  Feature = feature_names_vanilla_oos,
  Coefficient = coef_vector_vanilla_oos,
  stringsAsFactors = FALSE
)

coef_df_vanilla_oos <- coef_df_vanilla_oos %>% left_join(eligible_songs, by = c('Feature' = 'song'))

y_pred <- predict(ridge_model_vanilla_oos, s = best_lambda_vanilla_oos, newx = x_test)

# Calculate RMSE
rmse <- sqrt(mean((y_test - y_pred)^2))
cat("Root Mean Squared Error (RMSE) on Test Set: ", rmse, "\n")

# Calculate R-squared
rss <- sum((y_test - y_pred)^2)
tss <- sum((y_test - mean(y_test))^2)
r_squared <- 1 - (rss / tss)
cat("R-squared on Test Set: ", r_squared, "\n")

# Create a dataframe for actual vs predicted values
pred_vs_actual_df <- data.frame(
  Actual = y_test,
  Predicted = as.vector(y_pred)
)

# Plot predicted vs actual values
vanilla_pred_vs_actual_plot <- ggplot(pred_vs_actual_df, aes(x = Predicted, y = Actual)) +
  geom_point(color = 'dodgerblue', size = 2.5, alpha = 0.9) +
  geom_abline(intercept = 0, slope = 1, color = 'grey', linetype = 'dashed') +
  labs(
    title = "Predicted vs Actual Ratings",
    subtitle = 'Vanilla Model | Out-of-Sample Prediction | 1 point = 1 show',
    x = "Predicted Rating",
    y = "Actual Rating"
  ) +
  theme_rs3 +
  scale_x_continuous(limits = c(1, 5)) +
  scale_y_continuous(limits = c(1, 5))

ggsave('vanilla_pred_vs_actual_plot.png', vanilla_pred_vs_actual_plot, dpi = 700, height = 6, width = 7.5)

### --- --- --- --- --- --- --- --- --- --- ---
#### test predictiveness (tour-adjusted) ####
### --- --- --- --- --- --- --- --- --- --- ---

## eligible tours ##

eligible_tours <- shows_data %>%
  dplyr::select(showid, tour_name) %>%
  group_by(tour_name) %>%
  summarise(n_shows = n_distinct(showid)) %>%
  filter(n_shows >= 6)

## spread ##

combined_df_spread_tour_adj <- combined_df %>%
  select(showid, show_date, song, rating) %>%
  left_join(
    y = shows_data %>% dplyr::select(showid, tour_name),
    by = 'showid'
  ) %>%
  distinct() %>%
  filter(tour_name %in% eligible_tours$tour_name) %>%
  mutate(val = 1) %>%
  spread(key = song, value = val, fill = 0) %>%
  mutate(val = 1) %>%
  spread(key = tour_name, value = val, fill = 0) %>%
  dplyr::select(-showid, -show_date)

## fit model ##

index <- createDataPartition(combined_df_spread_tour_adj$rating, p = 0.8, list = FALSE)
train_data <- combined_df_spread_tour_adj[index, ]
test_data <- combined_df_spread_tour_adj[-index, ]

x_train <- as.matrix(train_data[, -1])
y_train <- train_data$rating
x_test <- as.matrix(test_data[, -1])
y_test <- test_data$rating

lambda_values_tour_adj_oos <- 10^seq(3, -2, length = 500)
ridge_model_tour_adj_oos <- glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_tour_adj_oos, standardize = FALSE)
cv_ridge_tour_adj_oos <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_tour_adj_oos, standardize = FALSE, nfolds = 250)
best_lambda_tour_adj_oos <- cv_ridge_tour_adj_oos$lambda.min
coefficients_tour_adj_oos <- coef(ridge_model_tour_adj_oos, s = best_lambda_tour_adj_oos)

coef_vector_tour_adj_oos <- as.vector(coefficients_tour_adj_oos)
feature_names_tour_adj_oos <- c("Intercept", colnames(train_data)[-1])

coef_df_tour_adj_oos <- data.frame(
  Feature = feature_names_tour_adj_oos,
  Coefficient = coef_vector_tour_adj_oos,
  stringsAsFactors = FALSE
)

coef_df_tour_adj_oos <- coef_df_tour_adj_oos %>% left_join(eligible_songs, by = c('Feature' = 'song'))

y_pred <- predict(ridge_model_tour_adj_oos, s = best_lambda_tour_adj_oos, newx = x_test)

# Calculate RMSE
rmse <- sqrt(mean((y_test - y_pred)^2))
cat("Root Mean Squared Error (RMSE) on Test Set: ", rmse, "\n")

# Calculate R-squared
rss <- sum((y_test - y_pred)^2)
tss <- sum((y_test - mean(y_test))^2)
r_squared <- 1 - (rss / tss)
cat("R-squared on Test Set: ", r_squared, "\n")

# Create a dataframe for actual vs predicted values
pred_vs_actual_df <- data.frame(
  Actual = y_test,
  Predicted = as.vector(y_pred)
)

# Plot predicted vs actual values
tour_adj_pred_vs_actual_plot <- ggplot(pred_vs_actual_df, aes(x = Predicted, y = Actual)) +
  geom_point(color = 'dodgerblue', size = 2.5, alpha = 0.9) +
  geom_abline(intercept = 0, slope = 1, color = 'grey', linetype = 'dashed') +
  labs(
    title = "Predicted vs Actual Ratings",
    subtitle = 'Tour-Adjusted Model | Out-of-Sample Prediction | 1 point = 1 show',
    x = "Predicted Rating",
    y = "Actual Rating"
  ) +
  theme_rs3 +
  scale_x_continuous(limits = c(1, 5)) +
  scale_y_continuous(limits = c(1, 5))

ggsave('tour_adj_pred_vs_actual_plot.png', tour_adj_pred_vs_actual_plot, dpi = 700, height = 6, width = 7.5)


### --- --- --- --- --- --- --- --- --- --- ---
#### combine all 3 coefficient flavors ####
### --- --- --- --- --- --- --- --- --- --- ---

clipr::write_clip(
  coef_df_vanilla %>%
    filter(Feature != 'Intercept') %>%
    dplyr::select(Feature, n_shows, vanilla_plus_minus = Coefficient) %>%
    left_join(y = coef_df_yr_adj %>%
                filter(Feature != 'Intercept', n_shows > 0) %>%
                dplyr::select(Feature, yr_adj_plus_minus = Coefficient), by = 'Feature') %>%
    left_join(y = coef_df_tour_adj %>%
                filter(Feature != 'Intercept', n_shows > 0) %>%
                dplyr::select(Feature, tour_adj_plus_minus = Coefficient), by = 'Feature')
)