Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### load libraries ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- library(httr)
- library(jsonlite)
- library(rvest)
- library(glmnet)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### pull shows, songs, setlists data from API ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- # Set API Key and base URL
- api_key <- "XXXXXXXXX"
- base_url <- "https://api.phish.net/v5"
- ## Shows ##
- url <- 'https://api.phish.net/v5/shows/artist/phish.json?order_by=showdate&apikey=XXXXXXXXX'
- response <- GET(url)
- shows_data <- fromJSON(content(response, "text"), flatten = TRUE)
- shows_data <- shows_data$data
- ## Songs ##
- url <- 'https://api.phish.net/v5/songs.json?apikey=XXXXXXXXX'
- response <- GET(url)
- songs_data <- fromJSON(content(response, "text"), flatten = TRUE)
- songs_data <- songs_data$data
- ## Setlists ##
- #url <- 'https://api.phish.net/v5/setlists/showdate/1997-11-22.json?apikey=XXXXXXXXX'
- url <- 'https://api.phish.net/v5/setlists.json?apikey=XXXXXXXXX'
- response <- GET(url)
- setlists_data <- fromJSON(content(response, "text"), flatten = TRUE)
- setlists_data <- setlists_data$data
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### scrape ratings data from web ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- ratings_df <- data.frame()
- for(i in 1984:2023) {
- url <- paste0("https://phish.net/music/ratings/", i)
- page <- read_html(url)
- ratings <- page %>%
- html_nodes('td') %>%
- html_text()
- ratings <- matrix(ratings, ncol = 6, byrow = TRUE)
- ratings <- as.data.frame(ratings, stringsAsFactors = FALSE)
- show_links <- page %>%
- html_nodes(".sortable a") %>%
- html_attr("href")
- ratings <- bind_cols(ratings, show_links)
- colnames(ratings) <- c('rating', 'show_date', 'venue', 'city', 'state', 'country', 'link')
- ratings_df <- bind_rows(ratings_df, ratings)
- }
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### join ratings and setlist data ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- combined_df <- ratings_df %>%
- mutate(link = paste0('https://phish.net', link)) %>%
- inner_join(y = shows_data %>%
- filter(exclude_from_stats == 0, artistid == 1) %>%
- select(showid, permalink),
- by = c('link' = 'permalink')) %>%
- inner_join(y = setlists_data %>%
- filter(artistid == 1) %>%
- select(showid, song),
- by = 'showid') %>%
- mutate(
- rating = as.numeric(rating),
- yr = lubridate::year(show_date))
- ## identify songs with at least 5 appearances ##
- eligible_songs <- combined_df %>%
- group_by(song) %>%
- summarise(
- n_shows = n_distinct(showid),
- avg_rating = mean(rating, na.rm = TRUE))
- # filter(n_shows >= 25)
- #combined_df <- combined_df %>%
- # filter(song %in% eligible_songs$song)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### fit vanilla model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- ## spread ##
- combined_df_spread_vanilla <- combined_df %>%
- select(showid, show_date, song, rating) %>%
- distinct() %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date)
- ## fit model ##
- x_vanilla <- as.matrix(combined_df_spread_vanilla[, -1])
- y_vanilla <- combined_df_spread_vanilla$rating
- lambda_values_vanilla <- 10^seq(3, -2, length = 500)
- ridge_model_vanilla <- glmnet(x_vanilla, y_vanilla, alpha = 0, lambda = lambda_values_vanilla, standardize = FALSE)
- cv_ridge_vanilla <- cv.glmnet(x_vanilla, y_vanilla, alpha = 0, lambda = lambda_values_vanilla, standardize = FALSE, nfolds = 250)
- best_lambda_vanilla <- cv_ridge_vanilla$lambda.min
- coefficients_vanilla <- coef(ridge_model_vanilla, s = best_lambda_vanilla)
- coef_vector_vanilla <- as.vector(coefficients_vanilla)
- feature_names_vanilla <- c("Intercept", colnames(combined_df_spread_vanilla)[-1])
- coef_df_vanilla <- data.frame(
- Feature = feature_names_vanilla,
- Coefficient = coef_vector_vanilla,
- stringsAsFactors = FALSE
- )
- coef_df_vanilla <- coef_df_vanilla %>% left_join(eligible_songs, by = c('Feature' = 'song'))
- View(coef_df_vanilla)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### [plot] top/bottom songs: vanilla model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- top_20_songs_vanilla_plot <- coef_df_vanilla %>%
- filter(Feature != 'Intercept') %>%
- top_n(20, Coefficient) %>%
- ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
- coord_flip() +
- geom_col(fill = 'dodgerblue') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.005,
- size = 2.75, fontface = 'bold', color = 'dodgerblue') +
- theme_rs2 +
- labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 20 Most Impactful Songs | Vanilla Model')
- ggsave('top_20_songs_vanilla_plot.png', top_20_songs_vanilla_plot, dpi = 700, height = 4.5, width = 8)
- bottom_20_songs_vanilla_plot <- coef_df_vanilla %>%
- filter(Feature != 'Intercept') %>%
- top_n(20, desc(Coefficient)) %>%
- ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
- coord_flip() +
- geom_col(fill = 'red') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0035,
- size = 2.75, fontface = 'bold', color = 'red') +
- theme_rs2 +
- labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Bottom 20 Most Impactful Songs | Vanilla Model')
- ggsave('bottom_20_songs_vanilla_plot.png', bottom_20_songs_vanilla_plot, dpi = 700, height = 4.5, width = 8)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### fit year-adjusted model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- ## spread ##
- combined_df_spread_yr_adj <- combined_df %>%
- select(showid, show_date, song, rating, yr) %>%
- distinct() %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- mutate(val = 1) %>%
- spread(key = yr, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date)
- ## fit model ##
- x_yr_adj <- as.matrix(combined_df_spread_yr_adj[, -1])
- y_yr_adj <- combined_df_spread_yr_adj$rating
- lambda_values_yr_adj <- 10^seq(3, -2, length = 500)
- ridge_model_yr_adj <- glmnet(x_yr_adj, y_yr_adj, alpha = 0, lambda = lambda_values_yr_adj, standardize = FALSE)
- cv_ridge_yr_adj <- cv.glmnet(x_yr_adj, y_yr_adj, alpha = 0, lambda = lambda_values_yr_adj, standardize = FALSE, nfolds = 250)
- best_lambda_yr_adj <- cv_ridge_yr_adj$lambda.min
- coefficients_yr_adj <- coef(ridge_model_yr_adj, s = best_lambda_yr_adj)
- coef_vector_yr_adj <- as.vector(coefficients_yr_adj)
- feature_names_yr_adj <- c("Intercept", colnames(combined_df_spread_yr_adj)[-1])
- coef_df_yr_adj <- data.frame(
- Feature = feature_names_yr_adj,
- Coefficient = coef_vector_yr_adj,
- stringsAsFactors = FALSE
- )
- coef_df_yr_adj <- coef_df_yr_adj %>% left_join(eligible_songs, by = c('Feature' = 'song'))
- View(coef_df_yr_adj)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### [plot] top/bottom songs: year-adjusted model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- top_20_songs_yr_adj_plot <- coef_df_yr_adj %>%
- filter(Feature != 'Intercept', n_shows > 0) %>%
- top_n(20, Coefficient) %>%
- ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
- coord_flip() +
- geom_col(fill = 'dodgerblue') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.005,
- size = 2.75, fontface = 'bold', color = 'dodgerblue') +
- theme_rs2 +
- labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 20 Most Impactful Songs | Year-Adjusted Model')
- ggsave('top_20_songs_yr_adj_plot.png', top_20_songs_yr_adj_plot, dpi = 700, height = 4.5, width = 8)
- bottom_20_songs_yr_adj_plot <- coef_df_yr_adj %>%
- filter(Feature != 'Intercept', n_shows > 0) %>%
- top_n(20, desc(Coefficient)) %>%
- ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
- coord_flip() +
- geom_col(fill = 'red') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0035,
- size = 2.75, fontface = 'bold', color = 'red') +
- theme_rs2 +
- labs(y = 'Song RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Bottom 20 Most Impactful Songs | Year-Adjusted Model')
- ggsave('bottom_20_songs_yr_adj_plot.png', bottom_20_songs_yr_adj_plot, dpi = 700, height = 4.5, width = 8)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### [plot] year contributions: year-adjusted model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- yr_adj_plot_df <- coef_df_yr_adj %>%
- filter(Feature != 'Intercept', is.na(n_shows)) %>%
- mutate(Feature = as.numeric(Feature)) %>%
- filter(Feature >= 1983, Feature <= 2023)
- yr_adj_plot <- yr_adj_plot_df %>%
- ggplot(aes(x = Feature, y = Coefficient)) +
- geom_col(fill = '#1DB954') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01), vjust = ifelse(Coefficient >= 0, -0.5, 1.5)),
- size = 2.05, fontface = 'bold', color = '#1DB954') +
- theme_rs3 +
- theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
- scale_x_continuous(breaks = seq(1984, 2023, 1)) +
- labs(title = 'Phish Setlist Rating Plus-Minus: Years', subtitle = 'Year-Adjusted Model', x = 'Show Year', y = 'Year RAPM')
- ggsave('yr_adj_plot.png', yr_adj_plot, dpi = 700, height = 4.5, width = 8.5)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### fit tour-adjusted model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- ## eligible tours ##
- eligible_tours <- shows_data %>%
- dplyr::select(showid, tour_name) %>%
- group_by(tour_name) %>%
- summarise(n_shows = n_distinct(showid)) %>%
- filter(n_shows >= 6)
- ## spread ##
- combined_df_spread_tour_adj <- combined_df %>%
- select(showid, show_date, song, rating) %>%
- left_join(
- y = shows_data %>% dplyr::select(showid, tour_name),
- by = 'showid'
- ) %>%
- distinct() %>%
- filter(tour_name %in% eligible_tours$tour_name) %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- mutate(val = 1) %>%
- spread(key = tour_name, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date)
- ## fit model ##
- x_tour_adj <- as.matrix(combined_df_spread_tour_adj[, -1])
- y_tour_adj <- combined_df_spread_tour_adj$rating
- lambda_values_tour_adj <- 10^seq(3, -2, length = 500)
- ridge_model_tour_adj <- glmnet(x_tour_adj, y_tour_adj, alpha = 0, lambda = lambda_values_tour_adj, standardize = FALSE)
- cv_ridge_tour_adj <- cv.glmnet(x_tour_adj, y_tour_adj, alpha = 0, lambda = lambda_values_tour_adj, standardize = FALSE, nfolds = 250)
- best_lambda_tour_adj <- cv_ridge_tour_adj$lambda.min
- coefficients_tour_adj <- coef(ridge_model_tour_adj, s = best_lambda_tour_adj)
- coef_vector_tour_adj <- as.vector(coefficients_tour_adj)
- feature_names_tour_adj <- c("Intercept", colnames(combined_df_spread_tour_adj)[-1])
- coef_df_tour_adj <- data.frame(
- Feature = feature_names_tour_adj,
- Coefficient = coef_vector_tour_adj,
- stringsAsFactors = FALSE
- )
- coef_df_tour_adj <- coef_df_tour_adj %>% left_join(eligible_songs, by = c('Feature' = 'song'))
- View(coef_df_tour_adj)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### [plot] top/bottom tours: tour-adjusted model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- top_20_tours_tour_adj_plot <- coef_df_tour_adj %>%
- filter(Feature != 'Intercept', Feature %in% eligible_tours$tour_name) %>%
- top_n(20, Coefficient) %>%
- ggplot(aes(x = reorder(Feature, Coefficient), y = Coefficient)) +
- coord_flip() +
- geom_col(fill = 'dodgerblue') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = 0.0075,
- size = 2.75, fontface = 'bold', color = 'dodgerblue') +
- theme_rs2 +
- labs(y = 'Tour RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Tours', subtitle = 'Top 20 Most Impactful Tours | Tour-Adjusted Model')
- ggsave('top_20_tours_tour_adj_plot.png', top_20_tours_tour_adj_plot, dpi = 700, height = 4.5, width = 8.5)
- bottom_20_tours_tour_adj_plot <- coef_df_tour_adj %>%
- filter(Feature != 'Intercept', Feature %in% eligible_tours$tour_name) %>%
- top_n(20, desc(Coefficient)) %>%
- ggplot(aes(x = reorder(Feature, desc(Coefficient)), y = Coefficient)) +
- coord_flip() +
- geom_col(fill = 'red') +
- geom_text(aes(label = scales::comma(Coefficient, accuracy = .01)), hjust = 0.5, nudge_y = -0.0095,
- size = 2.75, fontface = 'bold', color = 'red') +
- theme_rs2 +
- labs(y = 'Tour RAPM', x = '', title = 'Phish Setlist Rating Plus-Minus: Tours', subtitle = 'Bottom 20 Most Impactful Tours | Tour-Adjusted Model')
- ggsave('bottom_20_tours_tour_adj_plot.png', bottom_20_tours_tour_adj_plot, dpi = 700, height = 4.5, width = 8.5)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### [plot] ratings for most played songs: tour-adjusted model ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- most_played_tour_adj_plot <- coef_df_tour_adj %>%
- filter(Feature != 'Intercept') %>%
- top_n(40, n_shows) %>%
- dplyr::select(-avg_rating) %>%
- gather(metric, value, -Feature) %>%
- mutate(metric = ifelse(metric == 'Coefficient', 'Plus-Minus', 'Number of Shows')) %>%
- ggplot(aes(x = reorder(Feature, value, max), y = value, fill = metric)) +
- facet_wrap(~metric, scales = 'free_x') +
- coord_flip() +
- geom_col() +
- geom_text(aes(
- color = metric,
- label = ifelse(value > 1, scales::comma(value, accuracy = 1), scales::comma(value, accuracy = .01)),
- hjust = ifelse(value > 0, -0.1, 1.1)
- ),
- size = 2.75, fontface = 'bold') +
- scale_y_continuous(expand = expansion(mult = c(.1, .1))) +
- scale_color_manual(values = c('#ff8d1e', 'dodgerblue')) +
- scale_fill_manual(values = c('#ff8d1e', 'dodgerblue')) +
- theme_rs3 +
- theme(legend.position = 'none') +
- labs(y = '', x = '', title = 'Phish Setlist Rating Plus-Minus: Songs', subtitle = 'Top 40 Most Played Songs | Tour-Adjusted Model')
- ggsave('most_played_tour_adj_plot.png', most_played_tour_adj_plot, dpi = 700, height = 7.5, width = 8)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### comparing models ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- coef_df_vanilla %>%
- dplyr::select(Feature, n_shows, vanilla = Coefficient) %>%
- inner_join(y = coef_df_yr_adj %>% dplyr::select(Feature, yr_adjusted = Coefficient), by = 'Feature') %>%
- mutate(abs_diff = abs(vanilla-yr_adjusted))
- combined_df %>% filter(song == 'Fluffhead') %>% group_by(yr) %>% summarise(n_shows = n_distinct(showid), avg_rating = mean(rating, na.rm = TRUE))
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### comparing to average rating ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- coef_df_tour_adj %>%
- filter(n_shows > 0) %>%
- mutate(pctile_coef = percent_rank(Coefficient), pctile_avg_rating = percent_rank(avg_rating)) %>%
- mutate(pctile_diff = abs(pctile_coef - pctile_avg_rating))
- cor(x = coef_df_tour_adj$Coefficient, y = coef_df_tour_adj$avg_rating, use = 'complete.obs')^2
- avg_rating_vs_plus_minus_plot <- coef_df_tour_adj %>%
- filter(Feature != 'Intercept', n_shows > 25) %>%
- mutate(lab = ifelse(avg_rating < 3.6 | avg_rating > 4.1 | Coefficient > 0.08 | Coefficient < -0.08, Feature, NA)) %>%
- ggplot(aes(x = avg_rating, y = Coefficient)) +
- geom_point(color = 'dodgerblue') +
- geom_text_repel(aes(label = lab), size = 3) +
- theme_rs3 +
- scale_y_continuous(limits = c(-0.2, 0.2)) +
- labs(x = 'Average Rating For Shows Where This Song Is Played', y = 'Plus-Minus Contribution',
- caption = 'Songs with 25+ Appearances',
- title = 'Average Rating vs. Plus-Minus Contribution')
- ggsave('avg_rating_vs_plus_minus_plot.png', avg_rating_vs_plus_minus_plot, dpi = 700, height = 6, width = 7)
- fluffhead_yrs <- combined_df %>%
- select(showid, show_date, song, rating, yr) %>%
- distinct() %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date) %>%
- mutate(fluffhead_played = ifelse(Fluffhead == 1, 'Yes', 'No')) %>%
- group_by(yr, fluffhead_played) %>%
- summarise(avg_rating = mean(rating, na.rm = TRUE), n_shows = n()) %>%
- dplyr::select(yr, fluffhead_played, n_shows) %>%
- spread(key = fluffhead_played, value = n_shows) %>%
- filter(No > 3, Yes > 3)
- fluffhead_plot <- combined_df %>%
- select(showid, show_date, song, rating, yr) %>%
- distinct() %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date) %>%
- mutate(fluffhead_played = ifelse(Fluffhead == 1, 'Yes', 'No')) %>%
- group_by(yr, fluffhead_played) %>%
- summarise(avg_rating = mean(rating, na.rm = TRUE)) %>%
- filter(yr %in% fluffhead_yrs$yr) %>%
- ggplot(aes(x = as.factor(yr), y = avg_rating, color = as.factor(fluffhead_played))) +
- coord_flip() +
- geom_col(position = 'dodge', width = 0.8) +
- scale_fill_manual(values = c('red', 'dodgerblue'), name = 'Was Fluffhead Played?') +
- theme_rs2 +
- theme(legend.position = 'bottom') +
- labs(x = '', y = 'Average Rating',
- subtitle = 'Years with 3+ Fluffhead Appearances and Non-Appearances',
- title = 'Average Show Rating by Year: With or Without Fluffhead')
- ggsave('fluffhead_plot.png', fluffhead_plot, dpi = 700, height = 6, width = 7.5)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### test predictiveness (vanilla) ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- ## spread ##
- combined_df_spread_vanilla <- combined_df %>%
- select(showid, show_date, song, rating) %>%
- distinct() %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date)
- ## fit model ##
- index <- createDataPartition(combined_df_spread_vanilla$rating, p = 0.8, list = FALSE)
- train_data <- combined_df_spread_vanilla[index, ]
- test_data <- combined_df_spread_vanilla[-index, ]
- x_train <- as.matrix(train_data[, -1])
- y_train <- train_data$rating
- x_test <- as.matrix(test_data[, -1])
- y_test <- test_data$rating
- lambda_values_vanilla_oos <- 10^seq(3, -2, length = 500)
- ridge_model_vanilla_oos <- glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_vanilla_oos, standardize = FALSE)
- cv_ridge_vanilla_oos <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_vanilla_oos, standardize = FALSE, nfolds = 250)
- best_lambda_vanilla_oos <- cv_ridge_vanilla_oos$lambda.min
- coefficients_vanilla_oos <- coef(ridge_model_vanilla_oos, s = best_lambda_vanilla_oos)
- coef_vector_vanilla_oos <- as.vector(coefficients_vanilla_oos)
- feature_names_vanilla_oos <- c("Intercept", colnames(train_data)[-1])
- coef_df_vanilla_oos <- data.frame(
- Feature = feature_names_vanilla_oos,
- Coefficient = coef_vector_vanilla_oos,
- stringsAsFactors = FALSE
- )
- coef_df_vanilla_oos <- coef_df_vanilla_oos %>% left_join(eligible_songs, by = c('Feature' = 'song'))
- y_pred <- predict(ridge_model_vanilla_oos, s = best_lambda_vanilla_oos, newx = x_test)
- # Calculate RMSE
- rmse <- sqrt(mean((y_test - y_pred)^2))
- cat("Root Mean Squared Error (RMSE) on Test Set: ", rmse, "\n")
- # Calculate R-squared
- rss <- sum((y_test - y_pred)^2)
- tss <- sum((y_test - mean(y_test))^2)
- r_squared <- 1 - (rss / tss)
- cat("R-squared on Test Set: ", r_squared, "\n")
- # Create a dataframe for actual vs predicted values
- pred_vs_actual_df <- data.frame(
- Actual = y_test,
- Predicted = as.vector(y_pred)
- )
- # Plot predicted vs actual values
- vanilla_pred_vs_actual_plot <- ggplot(pred_vs_actual_df, aes(x = Predicted, y = Actual)) +
- geom_point(color = 'dodgerblue', size = 2.5, alpha = 0.9) +
- geom_abline(intercept = 0, slope = 1, color = 'grey', linetype = 'dashed') +
- labs(
- title = "Predicted vs Actual Ratings",
- subtitle = 'Vanilla Model | Out-of-Sample Prediction | 1 point = 1 show',
- x = "Predicted Rating",
- y = "Actual Rating"
- ) +
- theme_rs3 +
- scale_x_continuous(limits = c(1, 5)) +
- scale_y_continuous(limits = c(1, 5))
- ggsave('vanilla_pred_vs_actual_plot.png', vanilla_pred_vs_actual_plot, dpi = 700, height = 6, width = 7.5)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### test predictiveness (tour-adjusted) ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- ## eligible tours ##
- eligible_tours <- shows_data %>%
- dplyr::select(showid, tour_name) %>%
- group_by(tour_name) %>%
- summarise(n_shows = n_distinct(showid)) %>%
- filter(n_shows >= 6)
- ## spread ##
- combined_df_spread_tour_adj <- combined_df %>%
- select(showid, show_date, song, rating) %>%
- left_join(
- y = shows_data %>% dplyr::select(showid, tour_name),
- by = 'showid'
- ) %>%
- distinct() %>%
- filter(tour_name %in% eligible_tours$tour_name) %>%
- mutate(val = 1) %>%
- spread(key = song, value = val, fill = 0) %>%
- mutate(val = 1) %>%
- spread(key = tour_name, value = val, fill = 0) %>%
- dplyr::select(-showid, -show_date)
- ## fit model ##
- index <- createDataPartition(combined_df_spread_tour_adj$rating, p = 0.8, list = FALSE)
- train_data <- combined_df_spread_tour_adj[index, ]
- test_data <- combined_df_spread_tour_adj[-index, ]
- x_train <- as.matrix(train_data[, -1])
- y_train <- train_data$rating
- x_test <- as.matrix(test_data[, -1])
- y_test <- test_data$rating
- lambda_values_tour_adj_oos <- 10^seq(3, -2, length = 500)
- ridge_model_tour_adj_oos <- glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_tour_adj_oos, standardize = FALSE)
- cv_ridge_tour_adj_oos <- cv.glmnet(x_train, y_train, alpha = 0, lambda = lambda_values_tour_adj_oos, standardize = FALSE, nfolds = 250)
- best_lambda_tour_adj_oos <- cv_ridge_tour_adj_oos$lambda.min
- coefficients_tour_adj_oos <- coef(ridge_model_tour_adj_oos, s = best_lambda_tour_adj_oos)
- coef_vector_tour_adj_oos <- as.vector(coefficients_tour_adj_oos)
- feature_names_tour_adj_oos <- c("Intercept", colnames(train_data)[-1])
- coef_df_tour_adj_oos <- data.frame(
- Feature = feature_names_tour_adj_oos,
- Coefficient = coef_vector_tour_adj_oos,
- stringsAsFactors = FALSE
- )
- coef_df_tour_adj_oos <- coef_df_tour_adj_oos %>% left_join(eligible_songs, by = c('Feature' = 'song'))
- y_pred <- predict(ridge_model_tour_adj_oos, s = best_lambda_tour_adj_oos, newx = x_test)
- # Calculate RMSE
- rmse <- sqrt(mean((y_test - y_pred)^2))
- cat("Root Mean Squared Error (RMSE) on Test Set: ", rmse, "\n")
- # Calculate R-squared
- rss <- sum((y_test - y_pred)^2)
- tss <- sum((y_test - mean(y_test))^2)
- r_squared <- 1 - (rss / tss)
- cat("R-squared on Test Set: ", r_squared, "\n")
- # Create a dataframe for actual vs predicted values
- pred_vs_actual_df <- data.frame(
- Actual = y_test,
- Predicted = as.vector(y_pred)
- )
- # Plot predicted vs actual values
- tour_adj_pred_vs_actual_plot <- ggplot(pred_vs_actual_df, aes(x = Predicted, y = Actual)) +
- geom_point(color = 'dodgerblue', size = 2.5, alpha = 0.9) +
- geom_abline(intercept = 0, slope = 1, color = 'grey', linetype = 'dashed') +
- labs(
- title = "Predicted vs Actual Ratings",
- subtitle = 'Tour-Adjusted Model | Out-of-Sample Prediction | 1 point = 1 show',
- x = "Predicted Rating",
- y = "Actual Rating"
- ) +
- theme_rs3 +
- scale_x_continuous(limits = c(1, 5)) +
- scale_y_continuous(limits = c(1, 5))
- ggsave('tour_adj_pred_vs_actual_plot.png', tour_adj_pred_vs_actual_plot, dpi = 700, height = 6, width = 7.5)
- ### --- --- --- --- --- --- --- --- --- --- ---
- #### combine all 3 coefficient flavors ####
- ### --- --- --- --- --- --- --- --- --- --- ---
- clipr::write_clip(
- coef_df_vanilla %>%
- filter(Feature != 'Intercept') %>%
- dplyr::select(Feature, n_shows, vanilla_plus_minus = Coefficient) %>%
- left_join(y = coef_df_yr_adj %>%
- filter(Feature != 'Intercept', n_shows > 0) %>%
- dplyr::select(Feature, yr_adj_plus_minus = Coefficient), by = 'Feature') %>%
- left_join(y = coef_df_tour_adj %>%
- filter(Feature != 'Intercept', n_shows > 0) %>%
- dplyr::select(Feature, tour_adj_plus_minus = Coefficient), by = 'Feature')
- )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement