# rsmith_spotify
#
# Exploratory analysis of Spotify-owned playlists: how artist diversity,
# playlist length, mood, genre and title keywords relate to success metrics.

#### Load packages ####

# The script relies on these packages being attached; loading them here makes
# it runnable from a fresh R session.
library(dplyr)    # %>%, filter(), mutate(), summarise(), top_n(), min_rank()
library(tidyr)    # gather(), spread(), unnest()
library(ggplot2)  # all charts
library(stringr)  # str_replace_all()
# NOTE(review): `sentiments` (joined on its `word` column further down) is
# assumed to be the lexicon dataset shipped with tidytext -- confirm.
library(tidytext)

#### Load data ####

# read.delim2() reads a tab-delimited file and treats "," as the decimal mark.
# NOTE(review): confirm the export really uses decimal commas; otherwise
# read.delim() is the right reader.
playlist_summary_external <- read.delim2("~/Desktop/playlist_summary_external.txt", stringsAsFactors = FALSE)

#### Calculate success metrics ####

# Keep Spotify-owned playlists with at least 5,000 monthly 30s streams, then
# derive the engagement, skip and month-over-month metrics used below.
spotify_playlists <- playlist_summary_external %>%
  filter(owner == "spotify") %>%
  filter(monthly_stream30s >= 5000) %>%
  # Engagement ratios.
  mutate(
    pct_skippers = skippers / users,
    dau_mau_ratio = dau / mau,
    wau_mau_ratio = wau / mau,
    streams30s_per_mau = monthly_stream30s / mau,
    streams30s_per_dau = stream30s / dau
  ) %>%
  # Skips are streams that did not reach 30 seconds; the rate is only
  # reported for playlists with at least 50 daily users.
  mutate(
    skips = streams - stream30s,
    skip_rate = ifelse(dau >= 50, skips / streams, NA)
  ) %>%
  # Month-over-month growth and retention; retention is only reported when
  # the previous month had more than 50 monthly users.
  mutate(
    mau_growth_mom = mau / mau_previous_month - 1,
    mom_retention = ifelse(
      mau_previous_month <= 50,
      NA,
      mau_both_months / mau_previous_month
    )
  )

#### How does artist diversity correlate with success metrics? ####

# Playlists with more than two artists, scored by a diversity index:
# 1 - (tracks per artist) / (sqrt(n_tracks) / 2). The display-named copies of
# the success metrics become the facet titles in the chart below.
artist_diversity <- spotify_playlists %>%
  filter(n_artists > 2) %>%
  mutate(
    tracks_per_artist = n_tracks / n_artists,
    artist_diversity = 1 - (tracks_per_artist / (sqrt(n_tracks) / 2)),
    `MoM Retention` = mom_retention,
    `N 30s Streams Per MAU` = streams30s_per_mau,
    `Skip Rate` = skip_rate
  )

# One facet per success metric, artist diversity on the x axis.
artist_diversity %>%
  select(
    artist_diversity,
    `Skip Rate`,
    `MoM Retention`,
    `N 30s Streams Per MAU`
  ) %>%
  gather(
    key, value,
    `Skip Rate`, `MoM Retention`, `N 30s Streams Per MAU`
  ) %>%
  ggplot(aes(x = artist_diversity, y = value)) +
  geom_point(alpha = 0.6) +
  geom_smooth(span = 0.999) +
  facet_wrap(~key, scales = "free_y") +
  scale_x_continuous(limits = c(0.1, 0.9), breaks = seq(0.2, 0.8, 0.2)) +
  labs(
    title = "Playlist artist diversity vs. Success metrics",
    subtitle = "Spotify-owned playlists, > 5,000 30s streams",
    x = "Artist Diversity",
    y = ""
  ) +
  theme_classic() +
  theme(strip.text.x = element_text(size = 10.5))

#### Does the number of tracks in a playlist influence success metrics? ####

# Playlists with at most 1,000 tracks; one facet per success metric against
# track count on a log10 x axis.
spotify_playlists %>%
  filter(n_tracks <= 1000) %>%
  # Select and rename in one step so the facet titles are display-ready.
  select(
    n_tracks,
    `Skip Rate` = skip_rate,
    `MoM Retention` = mom_retention,
    `N 30s Streams Per MAU` = streams30s_per_mau
  ) %>%
  gather(
    key, value,
    `Skip Rate`, `MoM Retention`, `N 30s Streams Per MAU`
  ) %>%
  ggplot(aes(x = n_tracks, y = value)) +
  geom_point() +
  geom_smooth(span = 0.999) +
  facet_wrap(~key, scales = "free_y") +
  scale_x_log10() +
  labs(
    title = "Playlist length vs. Success metrics",
    subtitle = "Spotify-owned playlists, > 5,000 30s streams",
    x = "Number of tracks on playlist",
    y = ""
  ) +
  theme_classic()

#### Do particular moods correlate with success metrics? ####

# Long format: one row per (playlist, mood slot). Placeholder moods ("-" and
# "Other") are dropped, and mood becomes a factor ordered by descending mean
# 30s streams per MAU.
spotify_playlists_mood <- spotify_playlists %>%
  select(
    mood_1, mood_2, mood_3,
    streams, stream30s, dau, mau,
    dau_mau_ratio, streams30s_per_mau, skip_rate, mom_retention
  ) %>%
  gather(key = col, value = mood, mood_1, mood_2, mood_3) %>%
  select(-col) %>%
  filter(!(mood %in% c("-", "Other"))) %>%
  mutate(mood = reorder(mood, -streams30s_per_mau, mean, na.rm = TRUE))

# Per-mood mean and median of 30s streams per MAU (the mean is the facet label
# in the chart below).
spotify_playlists_mood_avg <- spotify_playlists_mood %>%
  group_by(mood) %>%
  summarise(
    avg = mean(streams30s_per_mau, na.rm = TRUE),
    median = median(streams30s_per_mau, na.rm = TRUE)
  ) %>%
  ungroup()

# Density of 30s streams per MAU for each mood. The vertical line is the mean
# across all of spotify_playlists; the bold label is the per-mood mean.
spotify_playlists_mood %>%
  ggplot(aes(x = streams30s_per_mau, fill = mood)) +
  geom_density() +
  geom_vline(
    xintercept = mean(spotify_playlists[["streams30s_per_mau"]], na.rm = TRUE)
  ) +
  geom_text(
    mapping = aes(x = 5, y = .135, label = scales::comma(round(avg, 1))),
    data = spotify_playlists_mood_avg,
    fontface = "bold",
    size = 3.25
  ) +
  facet_wrap(~mood) +
  scale_x_continuous(labels = scales::comma) +
  labs(
    title = "Distribution of 30s streams per MAU, by mood",
    x = "30s streams per MAU",
    y = ""
  ) +
  theme_minimal() +
  theme(strip.text.x = element_text(size = 10.5))

### Rank moods on each success metric

# Average retention, skip rate and 30s streams per MAU by mood. Each mood is
# ranked per metric (rank 1 = highest retention, lowest skip rate, most
# streams per MAU) and the table is sorted by the mean of the three ranks,
# largest first.
spotify_playlists_mood %>%
  group_by(mood) %>%
  summarise(
    avg_retention = mean(mom_retention, na.rm = TRUE),
    avg_skip_rate = mean(skip_rate, na.rm = TRUE),
    avg_streams30s_per_mau = mean(streams30s_per_mau, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(
    retention_rk = min_rank(desc(avg_retention)),
    skip_rk = min_rank(avg_skip_rate),
    streams30s_per_mau_rk = min_rank(desc(avg_streams30s_per_mau))
  ) %>%
  mutate(
    avg_rk = (retention_rk + skip_rk + streams30s_per_mau_rk) / 3
  ) %>%
  arrange(desc(avg_rk))

# Interactive inspection (opens a data viewer; not useful in batch runs).
# Playlists tagged 'Sentimental' in any of the three mood slots that also
# have MoM retention above 0.16, more than 10 30s streams per DAU, and a skip
# rate below 0.16.
View(spotify_playlists %>%
       filter(
         mood_1 == 'Sentimental' | mood_2 == 'Sentimental' | mood_3 == 'Sentimental',
         mom_retention > 0.16, streams30s_per_dau > 10, skip_rate < 0.16
         )
     )

# Playlists tagged 'Urgent' in any mood slot with MoM retention below 0.14,
# fewer than 9 30s streams per DAU, and a skip rate above 0.3.
View(spotify_playlists %>%
       filter(
         mood_1 == 'Urgent' | mood_2 == 'Urgent' | mood_3 == 'Urgent',
         mom_retention < 0.14, streams30s_per_dau < 9, skip_rate > 0.3
       )
)

# Design matrix for a linear model of skip rate on mood: one 0/1 indicator
# column per mood. The temporary row id keeps every row distinct while
# spreading and is dropped afterwards.
mood_for_lm <- spotify_playlists_mood %>%
  select(mood, skip_rate) %>%
  mutate(
    val = 1,
    id = row_number()
  ) %>%
  spread(key = mood, value = val, fill = 0) %>%
  select(-id)

# Regress skip rate on all mood indicators and print the fit summary.
lm(skip_rate ~ ., data = mood_for_lm) %>%
  summary()

#### Do particular genres correlate with success metrics? ####

# Top 50 playlists by MAU in long format: one row per (playlist, genre slot).
# Placeholder genres ("-" and "Other") are dropped, and genre becomes a factor
# ordered by descending mean MAU.
spotify_playlists_genre <- spotify_playlists %>%
  top_n(50, mau) %>%
  select(
    genre_1, genre_2, genre_3,
    streams, stream30s, dau, mau,
    dau_mau_ratio, streams30s_per_dau, skip_rate, mom_retention
  ) %>%
  gather(key = col, value = genre, genre_1, genre_2, genre_3) %>%
  select(-col) %>%
  filter(!(genre %in% c("-", "Other"))) %>%
  mutate(genre = reorder(genre, -mau, mean, na.rm = TRUE))

# Horizontal bars: number of top-50 playlists per genre, filled by the
# genre's average MAU per playlist (in thousands). The colour midpoint is the
# mean MAU over all rows of the long table.
spotify_playlists_genre %>%
  group_by(genre) %>%
  summarise(
    n = n(),
    `Average MAU Per Playlist (000s)` = mean(mau, na.rm = TRUE) / 1000
  ) %>%
  ungroup() %>%
  ggplot(
    aes(
      x = reorder(genre, n),
      y = n,
      fill = `Average MAU Per Playlist (000s)`
    )
  ) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient2(
    low = "#CD5C5C",
    mid = "#F0E68C",
    high = "#32CD32",
    midpoint = mean(spotify_playlists_genre[["mau"]]) / 1000
  ) +
  labs(
    title = "Top 50 Playlists by MAU: Most Popular Genres",
    subtitle = "Spotify-owned playlists",
    x = "",
    y = "Number of playlists"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Interactive inspection (opens a data viewer): the top 50 playlists by MAU
# that list 'Rap' in any of the three genre slots.
View(spotify_playlists %>%
       top_n(50, mau) %>%
       filter(
         genre_1 == 'Rap' | genre_2 == 'Rap' | genre_3 == 'Rap'
       )
)

# Design matrix for a linear model of MAU on genre, built from all playlists
# (not only the top 50): one 0/1 indicator column per genre. The temporary
# row id keeps every row distinct while spreading and is dropped afterwards.
genre_for_lm <- spotify_playlists %>%
  select(
    genre_1, genre_2, genre_3,
    streams, stream30s, dau, mau,
    dau_mau_ratio, streams30s_per_dau, skip_rate, mom_retention
  ) %>%
  gather(key = col, value = genre, genre_1, genre_2, genre_3) %>%
  select(-col) %>%
  filter(!(genre %in% c("-", "Other"))) %>%
  mutate(genre = reorder(genre, -mau, mean, na.rm = TRUE)) %>%
  select(genre, mau) %>%
  mutate(
    val = 1,
    id = row_number()
  ) %>%
  spread(key = genre, value = val, fill = 0) %>%
  select(-id)

# Regress MAU on all genre indicators and print the fit summary.
lm(mau ~ ., data = genre_for_lm) %>%
  summary()

#### Total monthly 30s streams by genre ####

# Sum monthly 30s streams over every playlist carrying a genre in any of its
# three genre slots; placeholder genres ("-" and "Other") are excluded.
spotify_playlists_genre2 <- spotify_playlists %>%
  select(genre_1, genre_2, genre_3, monthly_stream30s) %>%
  gather(key = col, value = genre, genre_1, genre_2, genre_3) %>%
  select(-col) %>%
  filter(!(genre %in% c("-", "Other"))) %>%
  group_by(genre) %>%
  summarise(monthly_stream30s = sum(monthly_stream30s, na.rm = TRUE))

# Horizontal bar chart, genres ordered by total monthly 30s streams.
spotify_playlists_genre2 %>%
  ggplot(
    aes(x = reorder(genre, monthly_stream30s), y = monthly_stream30s)
  ) +
  geom_col(fill = "steelblue4") +
  coord_flip() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Monthly 30s Streams by Genre",
    subtitle = "Spotify-owned playlists, > 5,000 30s streams",
    x = "",
    y = "Monthly 30s Streams"
  ) +
  theme_minimal()

#### Do particular keywords in a playlist title influence success metrics? ####

# One row per (playlist, title token): strip the square brackets from the
# serialized token list, split on ", ", and unnest the resulting list column.
tokens <- spotify_playlists %>%
  select(mau, tokens) %>%
  mutate(tokens = str_replace_all(tokens, "\\[|\\]", "")) %>%
  mutate(tokens = strsplit(as.character(tokens), ", ")) %>%
  unnest(tokens)

# Average MAU per token, keeping tokens that occur on more than two playlists,
# sorted from highest to lowest average.
average_mau_by_token <- tokens %>%
  group_by(tokens) %>%
  summarise(
    avg_mau = mean(mau),
    n = n()
  ) %>%
  ungroup() %>%
  filter(n > 2) %>%
  arrange(desc(avg_mau))

# Horizontal bar chart of the 15 tokens with the highest average MAU.
average_mau_by_token %>%
  top_n(15, avg_mau) %>%
  ggplot(aes(x = reorder(tokens, avg_mau), y = avg_mau)) +
  geom_col(fill = "steelblue4") +
  coord_flip() +
  scale_y_continuous(limits = c(0, 525000), labels = scales::comma) +
  labs(
    title = "Playlist titles: Average MAU per playlist by token",
    subtitle = "Spotify-owned playlists, > 5,000 30s streams",
    x = "",
    y = "Average MAU per playlist"
  ) +
  theme_minimal()

#### Average MAU by title-token sentiment ####

# Attach a sentiment label to each title token by matching it against the
# `word` column of `sentiments`, then average MAU per sentiment. Tokens with
# no match fall into the NA sentiment group. Groups with two or fewer rows
# are dropped.
tokens %>%
  left_join(sentiments, by = c("tokens" = "word")) %>%
  group_by(sentiment) %>%
  summarise(
    avg_mau = mean(mau),
    n = n()
  ) %>%
  ungroup() %>%
  filter(n > 2) %>%
  arrange(desc(avg_mau))

#### Share of skipping users vs. skip rate ####

# Scatter plot with a smoother, restricted to playlists with more than 1,000
# 30s streams: skippers/users on x against (streams - stream30s)/streams on y,
# both axes formatted as percentages.
spotify_playlists %>%
  filter(stream30s > 1000) %>%
  ggplot(aes(x = pct_skippers, y = skip_rate)) +
  geom_point() +
  geom_smooth() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "% Skippers vs Skip Rate",
    x = "skippers/users",
    y = "(streams - stream30s)/streams"
  ) +
  theme_minimal()