brianhaas19

NFL Stadium Attendance

Oct 18th, 2020 (edited)
396
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 8.99 KB | None | 0 0
  1. ###################################################################################################
  2. # NFL Stadium Attendance
  3. # Average regular season home game attendance at NFL stadiums from 2000 to 2019. Includes win percentage,
  4. # and whether playoffs were reached that year (playoff attendance NOT included).
  5. # Created by: reddit.com/user/brianhaas19
  6.  
  7. # Link to data:
  8. # https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md
  9. # https://en.wikipedia.org/wiki/NFL_International_Series
  10.  
  11. # Setup
  12. library(tidyverse)
  13. library(tidytuesdayR)
  14. library(scales)
  15. library(rvest)
  16.  
  17. # Load data:
  18. tt <- tt_load("2020-02-04")
  19. attendance <- tt$attendance
  20. standings <- tt$standings
  21.  
  22.  
  23. # Build a data frame containing details of the NFL International Series:
  24. # URL for Wiki page:
  25. url <- "https://en.wikipedia.org/wiki/NFL_International_Series"
  26.  
  27. # Given an xpath to a table with cells that span multiple rows, read and return the table as a data frame:
  28. # (Ref: https://www.scitilab.com/post_data/read_table/2019_09_11_readtable/)
  29. get_multi_row_table <- function(xpath) {
  30.  
  31.   # Get the lines of the table
  32.   lines <- url %>%
  33.     read_html() %>%
  34.     html_nodes(xpath = xpath) %>%
  35.     html_nodes(xpath = 'tbody/tr')
  36.  
  37.   # Initiate the (empty) table
  38.   ncol <-  lines %>%
  39.     .[[1]] %>%
  40.     html_children()%>%
  41.     length()
  42.  
  43.   nrow <- length(lines) - 1
  44.  
  45.   table <- as.data.frame(matrix(nrow = nrow,ncol = ncol))
  46.  
  47.   names(table) <- lines[[1]]%>%
  48.     html_children()%>%
  49.     html_text()%>%
  50.     gsub("\n","",.)
  51.  
  52.   # Fill the table
  53.   for(i in 1:nrow){
  54.     # Get content of the line:
  55.     linecontent <- lines[[i+1]]%>%
  56.       html_children()%>%
  57.       html_text()%>%
  58.       gsub("\n","",.)
  59.    
  60.     # Get the line repetition of each column
  61.     repetition <- lines[[i+1]]%>%
  62.       html_children()%>%
  63.       html_attr("rowspan")%>%
  64.       ifelse(is.na(.),1,.) %>% # if no rowspan, then it is a normal row, not a multiple one
  65.       as.numeric
  66.    
  67.     # Select only free columns
  68.     colselect <- is.na(table[i,])
  69.    
  70.     # Repeat the cells of the multiple rows down
  71.     for(j in 1:length(repetition)) {
  72.       span <- repetition[j]
  73.       if(sum(colselect)>1) {
  74.         table[(i):(i+span-1),colselect][,j] <- rep(linecontent[j],span)  
  75.       } else {
  76.         table[(i):(i+span-1),colselect] <- rep(linecontent[j],span)
  77.       }
  78.     }
  79.   }
  80.  
  81.   return(table)
  82. }
  83.  
  84. # Get the London International Series games:
  85. london <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[1]')
  86.  
  87. # There is a duplicate column name 'score', so fix this and rename other columns:
  88. names <- names(london) %>% tolower()
  89. names[3] <- "away_team"
  90. names[4] <- "away_score"
  91. names[5] <- "home_team"
  92. names[6] <- "home_score"
  93.  
  94. # Clean the table:
  95. london <- london %>%
  96.   set_names(names) %>%
  97.   mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
  98.          home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
  99.          year = str_extract(year, "^\\d{4}") %>% as.double(),
  100.          attendance = parse_number(attendance)) %>%
  101.   select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything()) %>%
  102.   select(-"pre-game show")
  103.  
  104. # Get the Mexico International Series games:
  105. mexico <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[2]')
  106.  
  107. # There is a duplicate column name 'score', so fix this and rename other columns:
  108. names <- names(mexico) %>% tolower()
  109. names[3] <- "away_team"
  110. names[4] <- "away_score"
  111. names[5] <- "home_team"
  112. names[6] <- "home_score"
  113.  
  114. # Clean the table:
  115. mexico <- mexico %>%
  116.   set_names(names) %>%
  117.   filter(attendance != "N/A") %>% # One game was cancelled, remove it
  118.   mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
  119.          home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
  120.          year = as.double(year),
  121.          attendance = parse_number(attendance)) %>%
  122.   select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything())
  123.  
  124. # Combine:
  125. international_series <- rbind(london, mexico)
  126.  
  127. # There is one other significant regular season game played abroad, the so-called NFL Fútbol Americano game. The attendance was very large (over 100k), so enter this game into the table manually:
  128. international_series <- international_series %>%
  129.   rbind(c(2005, "October 2", "San Francisco 49ers", "49ers", 14, "Arizona Cardinals", "Cardinals", 31, "Estadio Azteca", 103467)) %>%
  130.   mutate(year = as.double(year),
  131.          away_score = as.double(away_score),
  132.          home_score = as.double(home_score),
  133.          attendance = parse_number(attendance))
  134.  
  135. # Join `attendance` and `international_series` to adjust the total home attendance figure to account for nominal home games that were actually played in another stadium:
  136. attendance <- attendance %>%
  137.   left_join(international_series[ , c("year", "home_team_name", "attendance")],
  138.             by = c("year", "team_name" = "home_team_name")) %>%
  139.   rename(international_attendance = attendance) %>%
  140.   mutate(num_home_games = ifelse(is.na(international_attendance), 8, 7),
  141.          international_attendance = ifelse(num_home_games == 8, 0, international_attendance),
  142.          home_adjusted = home - international_attendance, # adjust the total home attendance down to account for International Series games
  143.          home_average = home_adjusted / num_home_games) %>%
  144.   dplyr::select(team_name, year, week, home_total = home, home_adjusted, num_home_games, home_average) %>%
  145.   filter(week == 1) # the season totals and averages are duplicated for each week, we only need one for each season
  146.  
  147. # Join `attendance` and `standings` to get additional data related to win percentages and playoff appearances:
  148. attendance <- attendance %>%
  149.   left_join(standings, by = c("team_name", "year")) %>%
  150.   mutate(win_rate = (wins)/(wins + loss),
  151.          playoffs_sb_winner = ifelse(sb_winner == "Won Superbowl",
  152.                                      "Won Superbowl",
  153.                                      ifelse(playoffs == "Playoffs",
  154.                                             "Reached Playoffs",
  155.                                             "No Playoffs"))) %>%
  156.   dplyr::select(team_name, year, week, home_total, home_adjusted, num_home_games, home_average, wins, loss, win_rate, playoffs, sb_winner, playoffs_sb_winner)
  157.  
  158.  
  159. # Colors
  160. colors <- c("nfl_blue" = "#003069",
  161.             "nfl_red" = "#D60303",
  162.             "nfl_grey" = "#EEEEEE",
  163.             "nfl_lightgrey" = "#FAFAFA")
  164.  
  165.  
  166. # Plot
  167. g <- attendance %>%
  168.   ggplot() +
  169.   geom_rect(aes(xmin = year - 0.5, xmax = year + 0.5, fill = win_rate), # *1
  170.             ymin = -Inf, ymax = Inf) + # removed the upper limit here as the triangles look fine on top of the filled rectanles
  171.   geom_line(aes(year, home_average)) +
  172.   geom_point(aes(year, 1.0e5, shape = playoffs_sb_winner),
  173.              fill = "black") +
  174.   facet_wrap(~ team_name, nrow = 8, scales = "free_x") +
  175.   labs(title = "NFL Stadium Attendance",
  176.        subtitle = str_wrap("Average regular season home game attendance at NFL stadiums from 2000 to 2019. Includes win percentage, and whether playoffs were reached that year (playoff attendance NOT included).", 120),
  177.        caption = expression(paste("Created by: ", italic("reddit.com/user/brianhaas19"), "\tData source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md")))) +
  178.   scale_x_continuous(name = NULL, # removed x-axis title as it's kind of redundant
  179.                      expand = c(0, 0)) +
  180.   scale_y_continuous(name = "Annual Attendance",
  181.                      label = comma,
  182.                      limits = c(NA, 1.0e5)) + # match the y value of the triangles to make them look like they hang from the top
  183.   scale_fill_gradient2(name = "Win %:",
  184.                        low = colors["nfl_blue"], mid = "white", high = colors["nfl_red"], midpoint = 0.5) +
  185.   scale_shape_manual(name = "Playoffs:",
  186.                      breaks = c("Reached Playoffs", "Won Superbowl"), # *3
  187.                      values = c("No Playoffs" = 32, "Reached Playoffs" = 6, "Won Superbowl" = 25)) + # *2
  188.   guides(fill = guide_colourbar(order = 1, title.vjust = 0.8, direction = "horizontal"), # *4
  189.          shape = guide_legend(order = 3, direction = "vertical", title.position = "left")) +
  190.   theme(legend.position = "bottom",
  191.         legend.box.just = c(0.5, 0.5),
  192.         legend.box.spacing = unit(0, "cm"),
  193.         legend.spacing = unit(2, "cm"),
  194.         legend.background = element_rect(fill = colors["nfl_grey"]),
  195.         legend.key = element_rect(fill = colors["nfl_grey"]),
  196.         strip.background = element_rect(fill = colors["nfl_blue"]),
  197.         strip.text = element_text(color = colors["nfl_lightgrey"]),
  198.         plot.background = element_rect(fill = colors["nfl_grey"]),
  199.         plot.caption = element_text(hjust = 0))
  200.  
  201. g
  202. #ggsave(file = str_c(getwd(), "/nfl.png"), plot = g, width = 12, height = 15) # uncomment to save to disk in current working directory
  203. ###################################################################################################
Add Comment
Please, Sign In to add comment