brianhaas19

Movie Jump Scares

Oct 31st, 2020 (edited)
292
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 10.22 KB | None | 0 0
  1. ###################################################################################################
  2. # Visualizing jump scares in scary movies rated by wheresthejump.com
  3. # The average 'jumpiness' of scary movies increases as the movie goes on
  4. # before dropping off steeply at the end.
  5. # Created by: reddit.com/user/brianhaas19
  6.  
  7. # Link to data:
  8. # https://wheresthejump.com/full-movie-list/
  9.  
  10. ### Setup
  11. library(tidyverse)
  12. library(rvest)
  13. library(lubridate)
  14.  
  15. ### Read data
  16.  
  17. # Read in table of movies and related info:
  18. url <- "https://wheresthejump.com/full-movie-list/"
  19. xpath1 <- '//*[@id="post-8925"]/div/div/table'
  20. table <- url %>%
  21.   read_html() %>%
  22.   html_nodes(xpath = xpath1) %>%
  23.   html_table() %>%
  24.   .[[1]]
  25.  
  26. # Each movie in the table has a link to a page containing timelines for the jump scares in the movie.
  27. # The movie runtime is also on this page.
  28. # Gather these links (716 in total):
  29. link_urls <- url %>%
  30.   read_html() %>%
  31.   html_nodes(css = 'td a') %>%
  32.   html_attr("href")
  33.  
  34. # Get the `runtime`s, `jump_time`s and `major_jump`s from the above links:
  35.  
  36. # Get the text related to jump times from each link:
  37. get_jump_time_text <- function(text) {
  38.   # Extract all <p>'s:
  39.   p <- text %>%
  40.     html_nodes(css = 'p') %>%
  41.     as.character() # convert to character vector
  42.  
  43.   # Indentify <p>'s that are jump times:
  44.   jump_times <- p %>%
  45.     str_detect("spoilerdescription") # All jump times contain this class
  46.   return(p[jump_times])
  47. }
  48.  
  49. # Extract a vector of jump times from the link's HTML text:
  50. extract_jump_times <- function(text) {
  51.   jump_times <- text %>%
  52.     str_extract("[0-9]{2}:[0-9]{2}:[0-9]{2}")
  53.   return(jump_times)
  54. }
  55.  
  56. # Extract a vector of logicals indicating major jump scares from the link's HTML text:
  57. extract_major_jumps <- function(text) {
  58.   major_jumps <- text %>%
  59.     str_detect("strong")
  60.   return(major_jumps)
  61. }
  62.  
  63. # Extract the movie's runtime from the link's HTML text:
  64. get_runtime <- function(text) {
  65.   # Extract all <p>'s:
  66.   text <- text %>%
  67.     html_nodes(css = 'p') %>%
  68.     as.character() # convert to character vector
  69.  
  70.   # Detect the runtime line:
  71.   runtime_line <- text %>%
  72.     str_detect("Runtime:")
  73.  
  74.   # Extract the runtime:
  75.   runtime <- text[runtime_line] %>%
  76.     str_extract("[0-9]{1,}") %>%
  77.     as.numeric()
  78.  
  79.   return(runtime)
  80. }
  81.  
  82. # Get the HTML text from each link:
  83. link_text <- map(link_urls, read_html) # Potentially slow! 716 links passed to read_html().
  84.  
  85. # Pass the HTML text from each link to functions to extract relevant info:
  86. jump_time_text <- map(link_text, get_jump_time_text) # HTML from each movie's link
  87. jump_time <- map(jump_time_text, extract_jump_times) # list of actual jump times for each movie
  88. major_jump <- map(jump_time_text, extract_major_jumps) # list of logicals indicating major jump scares
  89. runtime <- map_dbl(link_text, get_runtime) # vector of runtimes (pass in the full link text, rather than `jump_time_text`)
  90.  
  91.  
  92. ### Build the table
  93.  
  94. # Start building a `jumps` table which will contain the movie info and jump scare times.
  95.  
  96. # First we select only required columns from `table`, clean up the names, and attach the `runtimes`.
  97. # Clean names and select the most important columns:
  98. jumps <- table %>%
  99.   rename(name = "Movie Name",
  100.          year = Year,
  101.          jump_count = "Jump Count",
  102.          jump_scare_rating = "Jump Scare Rating") %>%
  103.   mutate(id = row_number(),
  104.          runtime = runtime) %>%
  105.   filter(jump_count > 0) %>% # exclude movies with no jumps
  106.   select(id, name, year, runtime, jump_count, jump_scare_rating)
  107. num_movies <- nrow(jumps)
  108.  
  109. # In general there is more than one jump scare in each movie. We need the data in long format
  110. # so we can add the `jump_time`s and `major_jump`s to the table. We can use `expand()` to do this.
  111. # Once the table is expanded, add the `jump_time` and `major_jump` columns:
  112. jumps <- jumps %>%
  113.   mutate(i = jump_count) %>% # `i` is an index variable for expanding
  114.   group_by(id, name, year, runtime, jump_count, jump_scare_rating) %>%
  115.   expand(i = seq(1:i)) %>%
  116.   ungroup %>%
  117.   mutate(jump_time = unlist(jump_time),
  118.          major_jump = unlist(major_jump)) %>%
  119.   rename(jump_number = i)
  120.  
  121. # Continue cleaning. The `runtime` and `jump_time` columns are in different formats.
  122. # Convert them both to seconds so that they are compatible. Then add a `jump_location`
  123. # column which indicates the point in the movie where the jump scare occurs
  124. # (`0` = start of movie, `50` = middle of movie, `100` = end of movie).
  125. # Calculate the `jump_location` by dividing the `jump_time` by the `runtime`:
  126. jumps <- jumps %>%
  127.   mutate(jump_time = period_to_seconds(hms(jump_time)),
  128.          runtime = 60*runtime,
  129.          jump_location = 100*round(jump_time/runtime, 2)) %>%
  130.   mutate(jump_location = as.integer(jump_location))
  131.  
  132. # Continue cleaning. If we inspect the `jump_location` column we see some anomalies:
  133. jumps$jump_location[which(jumps$jump_location > 100)]
  134.  
  135. # There are a number of jump scares with a `jump_location` greater than 100 which should be impossible
  136. # based on how we defined the variable. There are three bad points with values over 1,000 which we will remove.
  137. # For the others which are close to 100, we will set the value to 100. These values may have occured due a
  138. # discrepancy between the runtime and a jump scare occurring at the very end of the movie:
  139. jumps <- jumps %>%
  140.   filter(jump_location <= 1000) %>% # remove 3 bad `jump_location` values
  141.   mutate(jump_location = ifelse(jump_location <= 100, jump_location, 100)) # adjust the remaining anomalies down to 100 (i.e. end of movie)
  142.  
  143. # Save the cleaned `jumps` table to disk for ease of access later:
  144. # saveRDS(jumps, str_c(getwd(), "/jumps_cleaned.rds"))
  145. # jumps <- readRDS(str_c(getwd(), "/jumps_cleaned.rds"))
  146.  
  147.  
  148. ### Visualise the data
  149.  
  150. # First, construct a `timeline` data frame which consists of 100 time `segment`s for each movie.
  151. # Plot each movie as a straight line, with the movies ordered by "scariness" on the y-axis (scariest at the top).
  152. # Then, whenever a jump occurs in the movie, add a jump to the line for that movie. If it's a major jump
  153. # add a bigger jump to the line:
  154.  
  155. # Order the movies by scariness and save the order of the `id`s:
  156. ids <- jumps %>%
  157. mutate(jump_scare_rating_adj = jump_scare_rating * jump_count) %>%
  158. arrange(jump_scare_rating_adj, jump_scare_rating, jump_count) %>%
  159. select(id) %>%
  160. unlist() %>%
  161. unique()
  162.  
  163. # Add a trend line for the average amount of jump scares over time:
  164. n <- 8 # number of cuts
  165. a = 100/n # parameter for getting sequence of midpoints
  166. jumps_cut <- jumps %>% # table of average amount of jump scares at each point
  167. mutate(jump_location_cut = cut_interval(jump_location, n)) %>%
  168. group_by(jump_location_cut) %>%
  169. summarise(average_jump_count = sum(jump_count)/num_movies) %>%
  170. mutate(x = seq(a/2, 100 - a/2, by = 100/n),
  171. y = scale(average_jump_count, center = TRUE)*100 + length(unique(jumps$id))/2)
  172.  
  173. # Build the timeline for each movie and add the jump scares:
  174. timeline <- tibble(
  175. id = ids
  176. ) %>%
  177. mutate(count = 101, # 100 + 1, as we start at zero
  178. scare_rank = row_number()) %>%
  179. uncount(count) %>%  # Ref: https://stackoverflow.com/a/55492365
  180. mutate(segment = rep(0:100, length(unique(jumps$id)))) %>%
  181. left_join(jumps, by = c("id", "segment" = "jump_location")) %>%
  182. select(id, scare_rank, segment, jump_scare_rating, jump_number, major_jump) %>%
  183. mutate(jump_intensity = ifelse(is.na(major_jump), scare_rank, ifelse(major_jump, scare_rank + 20, scare_rank + 10))) # +20 for major_jump, +10 for !major_jump, 0 for NA
  184.  
  185. # Plot:
  186. # Recommended figure size if using R Notebook: fig.width=12, fig.height = 8
  187. g <- ggplot(timeline) +
  188.   geom_line(aes(segment, jump_intensity, group = scare_rank), # movie timelines with jumps where appropriate
  189.             size = 0.5, alpha = 0.2, color = "white",
  190.             show.legend = FALSE) +
  191.   geom_point(data = jumps_cut, # points for the trend line
  192.             aes(x, y, color = "Average 'jumpiness' over time")) +
  193.   geom_line(data = jumps_cut, # trend line for average number of jump scares over time
  194.             aes(x, y, color = "Average 'jumpiness' over time"),
  195.             size = 2,
  196.             alpha = 0.5) +
  197.   labs(title = expression(paste("Visualizing jump scares in 666 scary movies rated by ",
  198.                                 italic("wheresthejump.com"))),
  199.        subtitle = "The average 'jumpiness' of scary movies increases as the movie goes on before dropping off steeply at the end.",
  200.        caption = expression(paste("Created by: ",
  201.                             italic("reddit.com/user/brianhaas19"), "\tData source: ",
  202.                             italic("https://wheresthejump.com/full-movie-list/")))) +
  203.   scale_x_continuous(name = NULL,
  204.                      breaks = c(2, 50, 98),
  205.                      labels = c("Start", "Middle of Movie", "End"),
  206.                      expand = c(0, 0)) +
  207.   scale_y_continuous(name = "Movie Scariness",
  208.                      breaks = c(10, 325, 640), # positioning of the labels
  209.                      labels = c("Not Scary", "Scary", "Very Scary!"),
  210.                      expand = c(0, 0)) +
  211.   scale_color_manual(name = NULL, values = c("Average 'jumpiness' over time" = "white")) +
  212.   theme_bw() +
  213.   theme(plot.background = element_rect(fill = "#222222"), # grey taken from wheresthejump.com
  214.         panel.background = element_rect(fill = "#410000"), # red taken from wheresthejump.com
  215.         panel.grid = element_blank(),
  216.         plot.title = element_text(color = "white", size = 14),
  217.         plot.subtitle = element_text(color = "white", size = 12),
  218.         plot.caption = element_text(color = "white", size = 10, hjust = 0),
  219.         axis.title = element_text(color = "white", size = 12),
  220.         axis.text = element_text(color = "white", size = 10),
  221.         axis.line = element_line(color = "white"),
  222.         legend.position = "bottom",
  223.         legend.background = element_rect(fill = "#410000"),
  224.         legend.key = element_rect(fill = "#410000"),
  225.         legend.text = element_text(color = "white", size = 12),
  226.         legend.key.width = unit(2, "cm"))
  227. g
  228. #ggsave(file = str_c(getwd(), "/movie_jump_scares2.png"), plot = g, width = 12, height = 8) # uncomment to save to disk in current working directory
  229. ###################################################################################################
Add Comment
Please, Sign In to add comment