brianhaas19

NFL Attendance - Adjusted for Stadium Capacity

Oct 25th, 2020 (edited)
270
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 9.78 KB | None | 0 0
  1. ###################################################################################################
  2. # NFL Attendance - Adjusted for Stadium Capacity
  3. # Regular season home game attendance as a percentage of stadium capacity for NFL teams from 2000 to 2019.
  4. # Includes win percentage, and whether playoffs were reached that year (playoff attendance NOT included).
  5. # Created by: reddit.com/user/brianhaas19
  6.  
  7. # Link to data
  8. # https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md
  9. # https://en.wikipedia.org/wiki/NFL_International_Series
  10. # https://en.wikipedia.org/wiki/Chronology_of_home_stadiums_for_current_National_Football_League_teams
  11.  
  12. # Setup
  13. library(tidyverse)
  14. library(tidytuesdayR)
  15. library(scales)
  16. library(rvest)
  17.  
  18. # Load data:
  19. tt <- tt_load("2020-02-04")
  20. attendance <- tt$attendance
  21. standings <- tt$standings
  22. games <- tt$games
  23. capacity <- read_csv("capacity.csv",
  24.                      col_names = TRUE) %>% # 1st arg is wherever the csv file is stored (see https://pastebin.com/h7a7AzZn)
  25.   filter(!is.na(capacity)) %>%
  26.   mutate(capacity = round(capacity))
  27.  
  28. # Build a data frame containing details of the NFL International Series:
  29. # URL for Wiki page:
  30. url <- "https://en.wikipedia.org/wiki/NFL_International_Series"
  31.  
  32. # Given an xpath to a table with cells that span multiple rows, read and return the table as a data frame:
  33. # (Ref: https://www.scitilab.com/post_data/read_table/2019_09_11_readtable/)
  34. get_multi_row_table <- function(xpath) {
  35.  
  36.   # Get the lines of the table
  37.   lines <- url %>%
  38.     read_html() %>%
  39.     html_nodes(xpath = xpath) %>%
  40.     html_nodes(xpath = 'tbody/tr')
  41.  
  42.   # Initiate the (empty) table
  43.   ncol <-  lines %>%
  44.     .[[1]] %>%
  45.     html_children()%>%
  46.     length()
  47.  
  48.   nrow <- length(lines) - 1
  49.  
  50.   table <- as.data.frame(matrix(nrow = nrow,ncol = ncol))
  51.  
  52.   names(table) <- lines[[1]]%>%
  53.     html_children()%>%
  54.     html_text()%>%
  55.     gsub("\n","",.)
  56.  
  57.   # Fill the table
  58.   for(i in 1:nrow){
  59.     # Get content of the line:
  60.     linecontent <- lines[[i+1]]%>%
  61.       html_children()%>%
  62.       html_text()%>%
  63.       gsub("\n","",.)
  64.    
  65.     # Get the line repetition of each column
  66.     repetition <- lines[[i+1]]%>%
  67.       html_children()%>%
  68.       html_attr("rowspan")%>%
  69.       ifelse(is.na(.),1,.) %>% # if no rowspan, then it is a normal row, not a multiple one
  70.       as.numeric
  71.    
  72.     # Select only free columns
  73.     colselect <- is.na(table[i,])
  74.    
  75.     # Repeat the cells of the multiple rows down
  76.     for(j in 1:length(repetition)) {
  77.       span <- repetition[j]
  78.       if(sum(colselect)>1) {
  79.         table[(i):(i+span-1),colselect][,j] <- rep(linecontent[j],span)  
  80.       } else {
  81.         table[(i):(i+span-1),colselect] <- rep(linecontent[j],span)
  82.       }
  83.     }
  84.   }
  85.  
  86.   return(table)
  87. }
  88.  
  89. # Get the London International Series games:
  90. london <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[1]')
  91.  
  92. # There is a duplicate column name `score`, so fix this and rename other columns:
  93. names <- names(london) %>% tolower()
  94. names[3] <- "away_team"
  95. names[4] <- "away_score"
  96. names[5] <- "home_team"
  97. names[6] <- "home_score"
  98.  
  99. # Clean the table:
  100. london <- london %>%
  101.   set_names(names) %>%
  102.   mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
  103.          home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
  104.          year = str_extract(year, "^\\d{4}") %>% as.double(),
  105.          attendance = parse_number(attendance)) %>%
  106.   select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything()) %>%
  107.   select(-"pre-game show")
  108.  
  109. # Get the Mexico International Series games:
  110. mexico <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[2]')
  111.  
  112. # There is a duplicate column name `score`, so fix this and rename other columns:
  113. names <- names(mexico) %>% tolower()
  114. names[3] <- "away_team"
  115. names[4] <- "away_score"
  116. names[5] <- "home_team"
  117. names[6] <- "home_score"
  118.  
  119. # Clean the table:
  120. mexico <- mexico %>%
  121.   set_names(names) %>%
  122.   filter(attendance != "N/A") %>% # One game was cancelled, remove it
  123.   mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
  124.          home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
  125.          year = as.double(year),
  126.          attendance = parse_number(attendance)) %>%
  127.   select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything())
  128.  
  129. # Combine:
  130. international_series <- rbind(london, mexico)
  131.  
  132. # There is one other significant regular season game played abroad, the so-called NFL Fútbol Americano game.
  133. # The attendance was very large (over 100k), so enter this game into the table manually:
  134. international_series <- international_series %>%
  135.   rbind(c(2005, "October 2", "San Francisco 49ers", "49ers", 14, "Arizona Cardinals", "Cardinals", 31, "Estadio Azteca", 103467)) %>%
  136.   mutate(year = as.double(year),
  137.          away_score = as.double(away_score),
  138.          home_score = as.double(home_score),
  139.          attendance = parse_number(attendance))
  140.  
  141. # Join `attendance` and `international_series` to adjust the total home attendance figure to account for
  142. # nominal home games that were actually played in another stadium:
  143. attendance <- attendance %>%
  144.   left_join(international_series[ , c("year", "home_team_name", "attendance")],
  145.             by = c("year", "team_name" = "home_team_name")) %>%
  146.   rename(international_attendance = attendance) %>%
  147.   mutate(num_home_games = ifelse(is.na(international_attendance), 8, 7),
  148.          international_attendance = ifelse(num_home_games == 8, 0, international_attendance),
  149.          home_adjusted = home - international_attendance, # adjust the total home attendance down to account for International Series games
  150.          home_average = home_adjusted / num_home_games) %>%
  151.   dplyr::select(team_name, year, week, home_total = home, home_adjusted, num_home_games, home_average) %>%
  152.   filter(week == 1) # the season totals and averages are duplicated for each week, we only need one for each season
  153.  
  154. # Join `attendance` and `standings` to get additional data related to win percentages and playoff appearances:
  155. attendance <- attendance %>%
  156.   left_join(standings, by = c("team_name", "year")) %>%
  157.   mutate(win_rate = (wins)/(wins + loss),
  158.          playoffs_sb_winner = ifelse(sb_winner == "Won Superbowl",
  159.                                      "Won Superbowl",
  160.                                      ifelse(playoffs == "Playoffs",
  161.                                             "Reached Playoffs",
  162.                                             "No Playoffs"))) %>%
  163.   dplyr::select(team_name, year, week, home_total, home_adjusted, num_home_games, home_average, wins, loss, win_rate, playoffs, sb_winner, playoffs_sb_winner)
  164.  
  165. # Join `attendance` and `capacity` to get capacity of stadiums for each year:
  166. attendance <- attendance %>%
  167.   left_join(capacity, by = c("team_name", "year")) %>%
  168.   mutate(percent_full = home_average/capacity) %>%
  169.   dplyr::select(team_name, year, week, home_total, home_adjusted, num_home_games, home_average, capacity, percent_full,
  170.                 wins, loss, win_rate, playoffs, sb_winner, playoffs_sb_winner)
  171.  
  172. # Colors
  173. colors <- c("nfl_blue" = "#003069",
  174.             "nfl_red" = "#D60303",
  175.             "nfl_grey" = "#EEEEEE",
  176.             "nfl_lightgrey" = "#FAFAFA")
  177.  
  178. # Plot
  179. g <- attendance %>%
  180.   ggplot() +
  181.   geom_rect(aes(xmin = year - 0.5, xmax = year + 0.5, fill = win_rate),
  182.             ymin = -Inf, ymax = Inf) +
  183.   geom_line(aes(year, percent_full)) +
  184.   geom_hline(yintercept = 1.0, linetype = "dashed") +
  185.   geom_point(aes(year, 1.2, shape = playoffs_sb_winner),
  186.              fill = "black") +
  187.   facet_wrap(~ team_name, nrow = 8, scales = "free_x") +
  188.   labs(title = "NFL Attendance - Adjusted for Stadium Capacity",
  189.        subtitle = str_wrap("Regular season home game attendance as a percentage of stadium capacity for NFL teams from 2000 to 2019. Includes win percentage, and whether playoffs were reached that year (playoff attendance NOT included).", 120),
  190.        caption = expression(paste("Created by: ", italic("reddit.com/user/brianhaas19"), "\tData source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md")))) +
  191.   scale_x_continuous(name = NULL,
  192.                      expand = c(0, 0)) +
  193.   scale_y_continuous(name = "Annual Attendance as a Percentage of Stadium Capacity",
  194.                      breaks = c(0.6, 0.8, 1.0),
  195.                      label = percent(c(0.6, 0.8, 1.0), scale = 100),
  196.                      limits = c(NA, 1.2)) +
  197.   scale_fill_gradient2(name = "Win %:",
  198.                        low = colors["nfl_blue"], mid = "white", high = colors["nfl_red"], midpoint = 0.5) +
  199.   scale_shape_manual(name = "Playoffs:",
  200.                      breaks = c("Reached Playoffs", "Won Superbowl"), # *3
  201.                      values = c("No Playoffs" = 32, "Reached Playoffs" = 6, "Won Superbowl" = 25)) +
  202.   guides(fill = guide_colourbar(order = 1, title.vjust = 0.8, direction = "horizontal"),
  203.          shape = guide_legend(order = 3, direction = "vertical", title.position = "left")) +
  204.   theme(legend.position = "bottom",
  205.         legend.box.just = c(0.5, 0.5),
  206.         legend.box.spacing = unit(0, "cm"),
  207.         legend.spacing = unit(2, "cm"),
  208.         legend.background = element_rect(fill = colors["nfl_grey"]),
  209.         legend.key = element_rect(fill = colors["nfl_grey"]),
  210.         strip.background = element_rect(fill = colors["nfl_blue"]),
  211.         strip.text = element_text(color = colors["nfl_lightgrey"]),
  212.         plot.background = element_rect(fill = colors["nfl_grey"]),
  213.         plot.caption = element_text(hjust = 0))
  214.  
  215. g
  216. #ggsave(file = str_c(getwd(), "/nfl.png"), plot = g, width = 12, height = 15) # uncomment to save to disk in current working directory
  217. ###################################################################################################
Add Comment
Please, Sign In to add comment