Movie Jump Scares

###################################################################################################
# Visualizing jump scares in scary movies rated by wheresthejump.com
# The average 'jumpiness' of scary movies increases as the movie goes on
# before dropping off steeply at the end.
# Created by: reddit.com/user/brianhaas19

# Link to data:
# https://wheresthejump.com/full-movie-list/

### Setup
library(tidyverse)
library(rvest)
library(lubridate)

### Read data

# Read in table of movies and related info:
url <- "https://wheresthejump.com/full-movie-list/"
xpath1 <- '//*[@id="post-8925"]/div/div/table'
table <- url %>%
  read_html() %>%
  html_nodes(xpath = xpath1) %>%
  html_table() %>%
  .[[1]]

# Each movie in the table has a link to a page containing timelines for the jump scares in the movie.
# The movie runtime is also on this page.
# Gather these links (716 in total):
link_urls <- url %>%
  read_html() %>%
  html_nodes(css = 'td a') %>%
  html_attr("href")

# Get the `runtime`s, `jump_time`s and `major_jump`s from the above links:

# Get the text related to jump times from each link:
get_jump_time_text <- function(text) {
  # Extract all <p>'s:
  p <- text %>%
    html_nodes(css = 'p') %>%
    as.character() # convert to character vector

  # Indentify <p>'s that are jump times:
  jump_times <- p %>%
    str_detect("spoilerdescription") # All jump times contain this class
  return(p[jump_times])
}

# Extract a vector of jump times from the link's HTML text:
extract_jump_times <- function(text) {
  jump_times <- text %>%
    str_extract("[0-9]{2}:[0-9]{2}:[0-9]{2}")
  return(jump_times)
}

# Extract a vector of logicals indicating major jump scares from the link's HTML text:
extract_major_jumps <- function(text) {
  major_jumps <- text %>%
    str_detect("strong")
  return(major_jumps)
}

# Extract the movie's runtime from the link's HTML text:
get_runtime <- function(text) {
  # Extract all <p>'s:
  text <- text %>%
    html_nodes(css = 'p') %>%
    as.character() # convert to character vector

  # Detect the runtime line:
  runtime_line <- text %>%
    str_detect("Runtime:")

  # Extract the runtime:
  runtime <- text[runtime_line] %>%
    str_extract("[0-9]{1,}") %>%
    as.numeric()

  return(runtime)
}

# Get the HTML text from each link:
link_text <- map(link_urls, read_html) # Potentially slow! 716 links passed to read_html().

# Pass the HTML text from each link to functions to extract relevant info:
jump_time_text <- map(link_text, get_jump_time_text) # HTML from each movie's link
jump_time <- map(jump_time_text, extract_jump_times) # list of actual jump times for each movie
major_jump <- map(jump_time_text, extract_major_jumps) # list of logicals indicating major jump scares
runtime <- map_dbl(link_text, get_runtime) # vector of runtimes (pass in the full link text, rather than `jump_time_text`)


### Build the table

# Start building a `jumps` table which will contain the movie info and jump scare times.

# First we select only required columns from `table`, clean up the names, and attach the `runtimes`.
# Clean names and select the most important columns:
jumps <- table %>%
  rename(name = "Movie Name",
         year = Year,
         jump_count = "Jump Count",
         jump_scare_rating = "Jump Scare Rating") %>%
  mutate(id = row_number(),
         runtime = runtime) %>%
  filter(jump_count > 0) %>% # exclude movies with no jumps
  select(id, name, year, runtime, jump_count, jump_scare_rating)
num_movies <- nrow(jumps)

# In general there is more than one jump scare in each movie. We need the data in long format
# so we can add the `jump_time`s and `major_jump`s to the table. We can use `expand()` to do this.
# Once the table is expanded, add the `jump_time` and `major_jump` columns:
jumps <- jumps %>%
  mutate(i = jump_count) %>% # `i` is an index variable for expanding
  group_by(id, name, year, runtime, jump_count, jump_scare_rating) %>%
  expand(i = seq(1:i)) %>%
  ungroup %>%
  mutate(jump_time = unlist(jump_time),
         major_jump = unlist(major_jump)) %>%
  rename(jump_number = i)

# Continue cleaning. The `runtime` and `jump_time` columns are in different formats.
# Convert them both to seconds so that they are compatible. Then add a `jump_location`
# column which indicates the point in the movie where the jump scare occurs
# (`0` = start of movie, `50` = middle of movie, `100` = end of movie).
# Calculate the `jump_location` by dividing the `jump_time` by the `runtime`:
jumps <- jumps %>%
  mutate(jump_time = period_to_seconds(hms(jump_time)),
         runtime = 60*runtime,
         jump_location = 100*round(jump_time/runtime, 2)) %>%
  mutate(jump_location = as.integer(jump_location))

# Continue cleaning. If we inspect the `jump_location` column we see some anomalies:
jumps$jump_location[which(jumps$jump_location > 100)]

# There are a number of jump scares with a `jump_location` greater than 100 which should be impossible
# based on how we defined the variable. There are three bad points with values over 1,000 which we will remove.
# For the others which are close to 100, we will set the value to 100. These values may have occured due a
# discrepancy between the runtime and a jump scare occurring at the very end of the movie:
jumps <- jumps %>%
  filter(jump_location <= 1000) %>% # remove 3 bad `jump_location` values
  mutate(jump_location = ifelse(jump_location <= 100, jump_location, 100)) # adjust the remaining anomalies down to 100 (i.e. end of movie)

# Save the cleaned `jumps` table to disk for ease of access later:
# saveRDS(jumps, str_c(getwd(), "/jumps_cleaned.rds"))
# jumps <- readRDS(str_c(getwd(), "/jumps_cleaned.rds"))


### Visualise the data

# First, construct a `timeline` data frame which consists of 100 time `segment`s for each movie.
# Plot each movie as a straight line, with the movies ordered by "scariness" on the y-axis (scariest at the top).
# Then, whenever a jump occurs in the movie, add a jump to the line for that movie. If it's a major jump
# add a bigger jump to the line:

# Order the movies by scariness and save the order of the `id`s:
ids <- jumps %>%
mutate(jump_scare_rating_adj = jump_scare_rating * jump_count) %>%
arrange(jump_scare_rating_adj, jump_scare_rating, jump_count) %>%
select(id) %>%
unlist() %>%
unique()

# Add a trend line for the average amount of jump scares over time:
n <- 8 # number of cuts
a = 100/n # parameter for getting sequence of midpoints
jumps_cut <- jumps %>% # table of average amount of jump scares at each point
mutate(jump_location_cut = cut_interval(jump_location, n)) %>%
group_by(jump_location_cut) %>%
summarise(average_jump_count = sum(jump_count)/num_movies) %>%
mutate(x = seq(a/2, 100 - a/2, by = 100/n),
y = scale(average_jump_count, center = TRUE)*100 + length(unique(jumps$id))/2)

# Build the timeline for each movie and add the jump scares:
timeline <- tibble(
id = ids
) %>%
mutate(count = 101, # 100 + 1, as we start at zero
scare_rank = row_number()) %>%
uncount(count) %>%  # Ref: https://stackoverflow.com/a/55492365
mutate(segment = rep(0:100, length(unique(jumps$id)))) %>%
left_join(jumps, by = c("id", "segment" = "jump_location")) %>%
select(id, scare_rank, segment, jump_scare_rating, jump_number, major_jump) %>%
mutate(jump_intensity = ifelse(is.na(major_jump), scare_rank, ifelse(major_jump, scare_rank + 20, scare_rank + 10))) # +20 for major_jump, +10 for !major_jump, 0 for NA

# Plot:
# Recommended figure size if using R Notebook: fig.width=12, fig.height = 8
g <- ggplot(timeline) +
  geom_line(aes(segment, jump_intensity, group = scare_rank), # movie timelines with jumps where appropriate
            size = 0.5, alpha = 0.2, color = "white",
            show.legend = FALSE) +
  geom_point(data = jumps_cut, # points for the trend line
            aes(x, y, color = "Average 'jumpiness' over time")) +
  geom_line(data = jumps_cut, # trend line for average number of jump scares over time
            aes(x, y, color = "Average 'jumpiness' over time"),
            size = 2,
            alpha = 0.5) +
  labs(title = expression(paste("Visualizing jump scares in 666 scary movies rated by ",
                                italic("wheresthejump.com"))),
       subtitle = "The average 'jumpiness' of scary movies increases as the movie goes on before dropping off steeply at the end.",
       caption = expression(paste("Created by: ",
                            italic("reddit.com/user/brianhaas19"), "\tData source: ",
                            italic("https://wheresthejump.com/full-movie-list/")))) +
  scale_x_continuous(name = NULL,
                     breaks = c(2, 50, 98),
                     labels = c("Start", "Middle of Movie", "End"),
                     expand = c(0, 0)) +
  scale_y_continuous(name = "Movie Scariness",
                     breaks = c(10, 325, 640), # positioning of the labels
                     labels = c("Not Scary", "Scary", "Very Scary!"),
                     expand = c(0, 0)) +
  scale_color_manual(name = NULL, values = c("Average 'jumpiness' over time" = "white")) +
  theme_bw() +
  theme(plot.background = element_rect(fill = "#222222"), # grey taken from wheresthejump.com
        panel.background = element_rect(fill = "#410000"), # red taken from wheresthejump.com
        panel.grid = element_blank(),
        plot.title = element_text(color = "white", size = 14),
        plot.subtitle = element_text(color = "white", size = 12),
        plot.caption = element_text(color = "white", size = 10, hjust = 0),
        axis.title = element_text(color = "white", size = 12),
        axis.text = element_text(color = "white", size = 10),
        axis.line = element_line(color = "white"),
        legend.position = "bottom",
        legend.background = element_rect(fill = "#410000"),
        legend.key = element_rect(fill = "#410000"),
        legend.text = element_text(color = "white", size = 12),
        legend.key.width = unit(2, "cm"))
g
#ggsave(file = str_c(getwd(), "/movie_jump_scares2.png"), plot = g, width = 12, height = 8) # uncomment to save to disk in current working directory
###################################################################################################