Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################################################################################################
- # NFL Attendance - Adjusted for Stadium Capacity
- # Regular season home game attendance as a percentage of stadium capacity for NFL teams from 2000 to 2019.
- # Includes win percentage, and whether playoffs were reached that year (playoff attendance NOT included).
- # Created by: reddit.com/user/brianhaas19
- # Link to data
- # https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md
- # https://en.wikipedia.org/wiki/NFL_International_Series
- # https://en.wikipedia.org/wiki/Chronology_of_home_stadiums_for_current_National_Football_League_teams
- # Setup
- library(tidyverse)
- library(tidytuesdayR)
- library(scales)
- library(rvest)
- # Load data:
- tt <- tt_load("2020-02-04")
- attendance <- tt$attendance
- standings <- tt$standings
- games <- tt$games
- capacity <- read_csv("capacity.csv",
- col_names = TRUE) %>% # 1st arg is wherever the csv file is stored (see https://pastebin.com/h7a7AzZn)
- filter(!is.na(capacity)) %>%
- mutate(capacity = round(capacity))
- # Build a data frame containing details of the NFL International Series:
- # URL for Wiki page:
- url <- "https://en.wikipedia.org/wiki/NFL_International_Series"
- # Given an xpath to a table with cells that span multiple rows, read and return the table as a data frame:
- # (Ref: https://www.scitilab.com/post_data/read_table/2019_09_11_readtable/)
- get_multi_row_table <- function(xpath) {
- # Get the lines of the table
- lines <- url %>%
- read_html() %>%
- html_nodes(xpath = xpath) %>%
- html_nodes(xpath = 'tbody/tr')
- # Initiate the (empty) table
- ncol <- lines %>%
- .[[1]] %>%
- html_children()%>%
- length()
- nrow <- length(lines) - 1
- table <- as.data.frame(matrix(nrow = nrow,ncol = ncol))
- names(table) <- lines[[1]]%>%
- html_children()%>%
- html_text()%>%
- gsub("\n","",.)
- # Fill the table
- for(i in 1:nrow){
- # Get content of the line:
- linecontent <- lines[[i+1]]%>%
- html_children()%>%
- html_text()%>%
- gsub("\n","",.)
- # Get the line repetition of each column
- repetition <- lines[[i+1]]%>%
- html_children()%>%
- html_attr("rowspan")%>%
- ifelse(is.na(.),1,.) %>% # if no rowspan, then it is a normal row, not a multiple one
- as.numeric
- # Select only free columns
- colselect <- is.na(table[i,])
- # Repeat the cells of the multiple rows down
- for(j in 1:length(repetition)) {
- span <- repetition[j]
- if(sum(colselect)>1) {
- table[(i):(i+span-1),colselect][,j] <- rep(linecontent[j],span)
- } else {
- table[(i):(i+span-1),colselect] <- rep(linecontent[j],span)
- }
- }
- }
- return(table)
- }
- # Get the London International Series games:
- london <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[1]')
- # There is a duplicate column name `score`, so fix this and rename other columns:
- names <- names(london) %>% tolower()
- names[3] <- "away_team"
- names[4] <- "away_score"
- names[5] <- "home_team"
- names[6] <- "home_score"
- # Clean the table:
- london <- london %>%
- set_names(names) %>%
- mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
- home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
- year = str_extract(year, "^\\d{4}") %>% as.double(),
- attendance = parse_number(attendance)) %>%
- select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything()) %>%
- select(-"pre-game show")
- # Get the Mexico International Series games:
- mexico <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[2]')
- # There is a duplicate column name `score`, so fix this and rename other columns:
- names <- names(mexico) %>% tolower()
- names[3] <- "away_team"
- names[4] <- "away_score"
- names[5] <- "home_team"
- names[6] <- "home_score"
- # Clean the table:
- mexico <- mexico %>%
- set_names(names) %>%
- filter(attendance != "N/A") %>% # One game was cancelled, remove it
- mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
- home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
- year = as.double(year),
- attendance = parse_number(attendance)) %>%
- select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything())
- # Combine:
- international_series <- rbind(london, mexico)
- # There is one other significant regular season game played abroad, the so-called NFL Fútbol Americano game.
- # The attendance was very large (over 100k), so enter this game into the table manually:
- international_series <- international_series %>%
- rbind(c(2005, "October 2", "San Francisco 49ers", "49ers", 14, "Arizona Cardinals", "Cardinals", 31, "Estadio Azteca", 103467)) %>%
- mutate(year = as.double(year),
- away_score = as.double(away_score),
- home_score = as.double(home_score),
- attendance = parse_number(attendance))
- # Join `attendance` and `international_series` to adjust the total home attendance figure to account for
- # nominal home games that were actually played in another stadium:
- attendance <- attendance %>%
- left_join(international_series[ , c("year", "home_team_name", "attendance")],
- by = c("year", "team_name" = "home_team_name")) %>%
- rename(international_attendance = attendance) %>%
- mutate(num_home_games = ifelse(is.na(international_attendance), 8, 7),
- international_attendance = ifelse(num_home_games == 8, 0, international_attendance),
- home_adjusted = home - international_attendance, # adjust the total home attendance down to account for International Series games
- home_average = home_adjusted / num_home_games) %>%
- dplyr::select(team_name, year, week, home_total = home, home_adjusted, num_home_games, home_average) %>%
- filter(week == 1) # the season totals and averages are duplicated for each week, we only need one for each season
- # Join `attendance` and `standings` to get additional data related to win percentages and playoff appearances:
- attendance <- attendance %>%
- left_join(standings, by = c("team_name", "year")) %>%
- mutate(win_rate = (wins)/(wins + loss),
- playoffs_sb_winner = ifelse(sb_winner == "Won Superbowl",
- "Won Superbowl",
- ifelse(playoffs == "Playoffs",
- "Reached Playoffs",
- "No Playoffs"))) %>%
- dplyr::select(team_name, year, week, home_total, home_adjusted, num_home_games, home_average, wins, loss, win_rate, playoffs, sb_winner, playoffs_sb_winner)
- # Join `attendance` and `capacity` to get capacity of stadiums for each year:
- attendance <- attendance %>%
- left_join(capacity, by = c("team_name", "year")) %>%
- mutate(percent_full = home_average/capacity) %>%
- dplyr::select(team_name, year, week, home_total, home_adjusted, num_home_games, home_average, capacity, percent_full,
- wins, loss, win_rate, playoffs, sb_winner, playoffs_sb_winner)
- # Colors
- colors <- c("nfl_blue" = "#003069",
- "nfl_red" = "#D60303",
- "nfl_grey" = "#EEEEEE",
- "nfl_lightgrey" = "#FAFAFA")
- # Plot
- g <- attendance %>%
- ggplot() +
- geom_rect(aes(xmin = year - 0.5, xmax = year + 0.5, fill = win_rate),
- ymin = -Inf, ymax = Inf) +
- geom_line(aes(year, percent_full)) +
- geom_hline(yintercept = 1.0, linetype = "dashed") +
- geom_point(aes(year, 1.2, shape = playoffs_sb_winner),
- fill = "black") +
- facet_wrap(~ team_name, nrow = 8, scales = "free_x") +
- labs(title = "NFL Attendance - Adjusted for Stadium Capacity",
- subtitle = str_wrap("Regular season home game attendance as a percentage of stadium capacity for NFL teams from 2000 to 2019. Includes win percentage, and whether playoffs were reached that year (playoff attendance NOT included).", 120),
- caption = expression(paste("Created by: ", italic("reddit.com/user/brianhaas19"), "\tData source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md")))) +
- scale_x_continuous(name = NULL,
- expand = c(0, 0)) +
- scale_y_continuous(name = "Annual Attendance as a Percentage of Stadium Capacity",
- breaks = c(0.6, 0.8, 1.0),
- label = percent(c(0.6, 0.8, 1.0), scale = 100),
- limits = c(NA, 1.2)) +
- scale_fill_gradient2(name = "Win %:",
- low = colors["nfl_blue"], mid = "white", high = colors["nfl_red"], midpoint = 0.5) +
- scale_shape_manual(name = "Playoffs:",
- breaks = c("Reached Playoffs", "Won Superbowl"), # *3
- values = c("No Playoffs" = 32, "Reached Playoffs" = 6, "Won Superbowl" = 25)) +
- guides(fill = guide_colourbar(order = 1, title.vjust = 0.8, direction = "horizontal"),
- shape = guide_legend(order = 3, direction = "vertical", title.position = "left")) +
- theme(legend.position = "bottom",
- legend.box.just = c(0.5, 0.5),
- legend.box.spacing = unit(0, "cm"),
- legend.spacing = unit(2, "cm"),
- legend.background = element_rect(fill = colors["nfl_grey"]),
- legend.key = element_rect(fill = colors["nfl_grey"]),
- strip.background = element_rect(fill = colors["nfl_blue"]),
- strip.text = element_text(color = colors["nfl_lightgrey"]),
- plot.background = element_rect(fill = colors["nfl_grey"]),
- plot.caption = element_text(hjust = 0))
- g
- #ggsave(file = str_c(getwd(), "/nfl.png"), plot = g, width = 12, height = 15) # uncomment to save to disk in current working directory
- ###################################################################################################
Add Comment
Please, Sign In to add comment