Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################################################################################################
- # Visualizing jump scares in scary movies rated by wheresthejump.com
- # The average 'jumpiness' of scary movies increases as the movie goes on
- # before dropping off steeply at the end.
- # Created by: reddit.com/user/brianhaas19
- # Link to data:
- # https://wheresthejump.com/full-movie-list/
- ### Setup
- library(tidyverse)
- library(rvest)
- library(lubridate)
- ### Read data
- # Read in table of movies and related info:
- url <- "https://wheresthejump.com/full-movie-list/"
- xpath1 <- '//*[@id="post-8925"]/div/div/table'
- table <- url %>%
- read_html() %>%
- html_nodes(xpath = xpath1) %>%
- html_table() %>%
- .[[1]]
- # Each movie in the table has a link to a page containing timelines for the jump scares in the movie.
- # The movie runtime is also on this page.
- # Gather these links (716 in total):
- link_urls <- url %>%
- read_html() %>%
- html_nodes(css = 'td a') %>%
- html_attr("href")
- # Get the `runtime`s, `jump_time`s and `major_jump`s from the above links:
- # Get the text related to jump times from each link:
- get_jump_time_text <- function(text) {
- # Extract all <p>'s:
- p <- text %>%
- html_nodes(css = 'p') %>%
- as.character() # convert to character vector
- # Indentify <p>'s that are jump times:
- jump_times <- p %>%
- str_detect("spoilerdescription") # All jump times contain this class
- return(p[jump_times])
- }
- # Extract a vector of jump times from the link's HTML text:
- extract_jump_times <- function(text) {
- jump_times <- text %>%
- str_extract("[0-9]{2}:[0-9]{2}:[0-9]{2}")
- return(jump_times)
- }
- # Extract a vector of logicals indicating major jump scares from the link's HTML text:
- extract_major_jumps <- function(text) {
- major_jumps <- text %>%
- str_detect("strong")
- return(major_jumps)
- }
- # Extract the movie's runtime from the link's HTML text:
- get_runtime <- function(text) {
- # Extract all <p>'s:
- text <- text %>%
- html_nodes(css = 'p') %>%
- as.character() # convert to character vector
- # Detect the runtime line:
- runtime_line <- text %>%
- str_detect("Runtime:")
- # Extract the runtime:
- runtime <- text[runtime_line] %>%
- str_extract("[0-9]{1,}") %>%
- as.numeric()
- return(runtime)
- }
- # Get the HTML text from each link:
- link_text <- map(link_urls, read_html) # Potentially slow! 716 links passed to read_html().
- # Pass the HTML text from each link to functions to extract relevant info:
- jump_time_text <- map(link_text, get_jump_time_text) # HTML from each movie's link
- jump_time <- map(jump_time_text, extract_jump_times) # list of actual jump times for each movie
- major_jump <- map(jump_time_text, extract_major_jumps) # list of logicals indicating major jump scares
- runtime <- map_dbl(link_text, get_runtime) # vector of runtimes (pass in the full link text, rather than `jump_time_text`)
- ### Build the table
- # Start building a `jumps` table which will contain the movie info and jump scare times.
- # First we select only required columns from `table`, clean up the names, and attach the `runtimes`.
- # Clean names and select the most important columns:
- jumps <- table %>%
- rename(name = "Movie Name",
- year = Year,
- jump_count = "Jump Count",
- jump_scare_rating = "Jump Scare Rating") %>%
- mutate(id = row_number(),
- runtime = runtime) %>%
- filter(jump_count > 0) %>% # exclude movies with no jumps
- select(id, name, year, runtime, jump_count, jump_scare_rating)
- num_movies <- nrow(jumps)
- # In general there is more than one jump scare in each movie. We need the data in long format
- # so we can add the `jump_time`s and `major_jump`s to the table. We can use `expand()` to do this.
- # Once the table is expanded, add the `jump_time` and `major_jump` columns:
- jumps <- jumps %>%
- mutate(i = jump_count) %>% # `i` is an index variable for expanding
- group_by(id, name, year, runtime, jump_count, jump_scare_rating) %>%
- expand(i = seq(1:i)) %>%
- ungroup %>%
- mutate(jump_time = unlist(jump_time),
- major_jump = unlist(major_jump)) %>%
- rename(jump_number = i)
- # Continue cleaning. The `runtime` and `jump_time` columns are in different formats.
- # Convert them both to seconds so that they are compatible. Then add a `jump_location`
- # column which indicates the point in the movie where the jump scare occurs
- # (`0` = start of movie, `50` = middle of movie, `100` = end of movie).
- # Calculate the `jump_location` by dividing the `jump_time` by the `runtime`:
- jumps <- jumps %>%
- mutate(jump_time = period_to_seconds(hms(jump_time)),
- runtime = 60*runtime,
- jump_location = 100*round(jump_time/runtime, 2)) %>%
- mutate(jump_location = as.integer(jump_location))
- # Continue cleaning. If we inspect the `jump_location` column we see some anomalies:
- jumps$jump_location[which(jumps$jump_location > 100)]
- # There are a number of jump scares with a `jump_location` greater than 100 which should be impossible
- # based on how we defined the variable. There are three bad points with values over 1,000 which we will remove.
- # For the others which are close to 100, we will set the value to 100. These values may have occured due a
- # discrepancy between the runtime and a jump scare occurring at the very end of the movie:
- jumps <- jumps %>%
- filter(jump_location <= 1000) %>% # remove 3 bad `jump_location` values
- mutate(jump_location = ifelse(jump_location <= 100, jump_location, 100)) # adjust the remaining anomalies down to 100 (i.e. end of movie)
- # Save the cleaned `jumps` table to disk for ease of access later:
- # saveRDS(jumps, str_c(getwd(), "/jumps_cleaned.rds"))
- # jumps <- readRDS(str_c(getwd(), "/jumps_cleaned.rds"))
- ### Visualise the data
- # First, construct a `timeline` data frame which consists of 100 time `segment`s for each movie.
- # Plot each movie as a straight line, with the movies ordered by "scariness" on the y-axis (scariest at the top).
- # Then, whenever a jump occurs in the movie, add a jump to the line for that movie. If it's a major jump
- # add a bigger jump to the line:
- # Order the movies by scariness and save the order of the `id`s:
- ids <- jumps %>%
- mutate(jump_scare_rating_adj = jump_scare_rating * jump_count) %>%
- arrange(jump_scare_rating_adj, jump_scare_rating, jump_count) %>%
- select(id) %>%
- unlist() %>%
- unique()
- # Add a trend line for the average amount of jump scares over time:
- n <- 8 # number of cuts
- a = 100/n # parameter for getting sequence of midpoints
- jumps_cut <- jumps %>% # table of average amount of jump scares at each point
- mutate(jump_location_cut = cut_interval(jump_location, n)) %>%
- group_by(jump_location_cut) %>%
- summarise(average_jump_count = sum(jump_count)/num_movies) %>%
- mutate(x = seq(a/2, 100 - a/2, by = 100/n),
- y = scale(average_jump_count, center = TRUE)*100 + length(unique(jumps$id))/2)
- # Build the timeline for each movie and add the jump scares:
- timeline <- tibble(
- id = ids
- ) %>%
- mutate(count = 101, # 100 + 1, as we start at zero
- scare_rank = row_number()) %>%
- uncount(count) %>% # Ref: https://stackoverflow.com/a/55492365
- mutate(segment = rep(0:100, length(unique(jumps$id)))) %>%
- left_join(jumps, by = c("id", "segment" = "jump_location")) %>%
- select(id, scare_rank, segment, jump_scare_rating, jump_number, major_jump) %>%
- mutate(jump_intensity = ifelse(is.na(major_jump), scare_rank, ifelse(major_jump, scare_rank + 20, scare_rank + 10))) # +20 for major_jump, +10 for !major_jump, 0 for NA
- # Plot:
- # Recommended figure size if using R Notebook: fig.width=12, fig.height = 8
- g <- ggplot(timeline) +
- geom_line(aes(segment, jump_intensity, group = scare_rank), # movie timelines with jumps where appropriate
- size = 0.5, alpha = 0.2, color = "white",
- show.legend = FALSE) +
- geom_point(data = jumps_cut, # points for the trend line
- aes(x, y, color = "Average 'jumpiness' over time")) +
- geom_line(data = jumps_cut, # trend line for average number of jump scares over time
- aes(x, y, color = "Average 'jumpiness' over time"),
- size = 2,
- alpha = 0.5) +
- labs(title = expression(paste("Visualizing jump scares in 666 scary movies rated by ",
- italic("wheresthejump.com"))),
- subtitle = "The average 'jumpiness' of scary movies increases as the movie goes on before dropping off steeply at the end.",
- caption = expression(paste("Created by: ",
- italic("reddit.com/user/brianhaas19"), "\tData source: ",
- italic("https://wheresthejump.com/full-movie-list/")))) +
- scale_x_continuous(name = NULL,
- breaks = c(2, 50, 98),
- labels = c("Start", "Middle of Movie", "End"),
- expand = c(0, 0)) +
- scale_y_continuous(name = "Movie Scariness",
- breaks = c(10, 325, 640), # positioning of the labels
- labels = c("Not Scary", "Scary", "Very Scary!"),
- expand = c(0, 0)) +
- scale_color_manual(name = NULL, values = c("Average 'jumpiness' over time" = "white")) +
- theme_bw() +
- theme(plot.background = element_rect(fill = "#222222"), # grey taken from wheresthejump.com
- panel.background = element_rect(fill = "#410000"), # red taken from wheresthejump.com
- panel.grid = element_blank(),
- plot.title = element_text(color = "white", size = 14),
- plot.subtitle = element_text(color = "white", size = 12),
- plot.caption = element_text(color = "white", size = 10, hjust = 0),
- axis.title = element_text(color = "white", size = 12),
- axis.text = element_text(color = "white", size = 10),
- axis.line = element_line(color = "white"),
- legend.position = "bottom",
- legend.background = element_rect(fill = "#410000"),
- legend.key = element_rect(fill = "#410000"),
- legend.text = element_text(color = "white", size = 12),
- legend.key.width = unit(2, "cm"))
- g
- #ggsave(file = str_c(getwd(), "/movie_jump_scares2.png"), plot = g, width = 12, height = 8) # uncomment to save to disk in current working directory
- ###################################################################################################
Add Comment
Please, Sign In to add comment