Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################################################################################################
- # NFL Stadium Attendance
- # Average regular season home game attendance at NFL stadiums from 2000 to 2019. Includes win percentage,
- # and whether playoffs were reached that year (playoff attendance NOT included).
- # Created by: reddit.com/user/brianhaas19
- # Link to data:
- # https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md
- # https://en.wikipedia.org/wiki/NFL_International_Series
- # Setup
- library(tidyverse)
- library(tidytuesdayR)
- library(scales)
- library(rvest)
- # Load data:
- tt <- tt_load("2020-02-04")
- attendance <- tt$attendance
- standings <- tt$standings
- # Build a data frame containing details of the NFL International Series:
- # URL for Wiki page:
- url <- "https://en.wikipedia.org/wiki/NFL_International_Series"
- # Given an xpath to a table with cells that span multiple rows, read and return the table as a data frame:
- # (Ref: https://www.scitilab.com/post_data/read_table/2019_09_11_readtable/)
- get_multi_row_table <- function(xpath) {
- # Get the lines of the table
- lines <- url %>%
- read_html() %>%
- html_nodes(xpath = xpath) %>%
- html_nodes(xpath = 'tbody/tr')
- # Initiate the (empty) table
- ncol <- lines %>%
- .[[1]] %>%
- html_children()%>%
- length()
- nrow <- length(lines) - 1
- table <- as.data.frame(matrix(nrow = nrow,ncol = ncol))
- names(table) <- lines[[1]]%>%
- html_children()%>%
- html_text()%>%
- gsub("\n","",.)
- # Fill the table
- for(i in 1:nrow){
- # Get content of the line:
- linecontent <- lines[[i+1]]%>%
- html_children()%>%
- html_text()%>%
- gsub("\n","",.)
- # Get the line repetition of each column
- repetition <- lines[[i+1]]%>%
- html_children()%>%
- html_attr("rowspan")%>%
- ifelse(is.na(.),1,.) %>% # if no rowspan, then it is a normal row, not a multiple one
- as.numeric
- # Select only free columns
- colselect <- is.na(table[i,])
- # Repeat the cells of the multiple rows down
- for(j in 1:length(repetition)) {
- span <- repetition[j]
- if(sum(colselect)>1) {
- table[(i):(i+span-1),colselect][,j] <- rep(linecontent[j],span)
- } else {
- table[(i):(i+span-1),colselect] <- rep(linecontent[j],span)
- }
- }
- }
- return(table)
- }
- # Get the London International Series games:
- london <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[1]')
- # There is a duplicate column name 'score', so fix this and rename other columns:
- names <- names(london) %>% tolower()
- names[3] <- "away_team"
- names[4] <- "away_score"
- names[5] <- "home_team"
- names[6] <- "home_score"
- # Clean the table:
- london <- london %>%
- set_names(names) %>%
- mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
- home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
- year = str_extract(year, "^\\d{4}") %>% as.double(),
- attendance = parse_number(attendance)) %>%
- select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything()) %>%
- select(-"pre-game show")
- # Get the Mexico International Series games:
- mexico <- get_multi_row_table(xpath = '//*[@id="mw-content-text"]/div[1]/table[2]')
- # There is a duplicate column name 'score', so fix this and rename other columns:
- names <- names(mexico) %>% tolower()
- names[3] <- "away_team"
- names[4] <- "away_score"
- names[5] <- "home_team"
- names[6] <- "home_score"
- # Clean the table:
- mexico <- mexico %>%
- set_names(names) %>%
- filter(attendance != "N/A") %>% # One game was cancelled, remove it
- mutate(away_team_name = str_extract(away_team, "[0-9A-Za-z]*$"),
- home_team_name = str_extract(home_team, "[0-9A-Za-z]*$"),
- year = as.double(year),
- attendance = parse_number(attendance)) %>%
- select(year:away_team, away_team_name, away_score:home_team, home_team_name, everything())
- # Combine:
- international_series <- rbind(london, mexico)
- # There is one other significant regular season game played abroad, the so-called NFL Fútbol Americano game. The attendance was very large (over 100k), so enter this game into the table manually:
- international_series <- international_series %>%
- rbind(c(2005, "October 2", "San Francisco 49ers", "49ers", 14, "Arizona Cardinals", "Cardinals", 31, "Estadio Azteca", 103467)) %>%
- mutate(year = as.double(year),
- away_score = as.double(away_score),
- home_score = as.double(home_score),
- attendance = parse_number(attendance))
- # Join `attendance` and `international_series` to adjust the total home attendance figure to account for nominal home games that were actually played in another stadium:
- attendance <- attendance %>%
- left_join(international_series[ , c("year", "home_team_name", "attendance")],
- by = c("year", "team_name" = "home_team_name")) %>%
- rename(international_attendance = attendance) %>%
- mutate(num_home_games = ifelse(is.na(international_attendance), 8, 7),
- international_attendance = ifelse(num_home_games == 8, 0, international_attendance),
- home_adjusted = home - international_attendance, # adjust the total home attendance down to account for International Series games
- home_average = home_adjusted / num_home_games) %>%
- dplyr::select(team_name, year, week, home_total = home, home_adjusted, num_home_games, home_average) %>%
- filter(week == 1) # the season totals and averages are duplicated for each week, we only need one for each season
- # Join `attendance` and `standings` to get additional data related to win percentages and playoff appearances:
- attendance <- attendance %>%
- left_join(standings, by = c("team_name", "year")) %>%
- mutate(win_rate = (wins)/(wins + loss),
- playoffs_sb_winner = ifelse(sb_winner == "Won Superbowl",
- "Won Superbowl",
- ifelse(playoffs == "Playoffs",
- "Reached Playoffs",
- "No Playoffs"))) %>%
- dplyr::select(team_name, year, week, home_total, home_adjusted, num_home_games, home_average, wins, loss, win_rate, playoffs, sb_winner, playoffs_sb_winner)
- # Colors
- colors <- c("nfl_blue" = "#003069",
- "nfl_red" = "#D60303",
- "nfl_grey" = "#EEEEEE",
- "nfl_lightgrey" = "#FAFAFA")
- # Plot
- g <- attendance %>%
- ggplot() +
- geom_rect(aes(xmin = year - 0.5, xmax = year + 0.5, fill = win_rate), # *1
- ymin = -Inf, ymax = Inf) + # removed the upper limit here as the triangles look fine on top of the filled rectanles
- geom_line(aes(year, home_average)) +
- geom_point(aes(year, 1.0e5, shape = playoffs_sb_winner),
- fill = "black") +
- facet_wrap(~ team_name, nrow = 8, scales = "free_x") +
- labs(title = "NFL Stadium Attendance",
- subtitle = str_wrap("Average regular season home game attendance at NFL stadiums from 2000 to 2019. Includes win percentage, and whether playoffs were reached that year (playoff attendance NOT included).", 120),
- caption = expression(paste("Created by: ", italic("reddit.com/user/brianhaas19"), "\tData source: ", italic("https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-02-04/readme.md")))) +
- scale_x_continuous(name = NULL, # removed x-axis title as it's kind of redundant
- expand = c(0, 0)) +
- scale_y_continuous(name = "Annual Attendance",
- label = comma,
- limits = c(NA, 1.0e5)) + # match the y value of the triangles to make them look like they hang from the top
- scale_fill_gradient2(name = "Win %:",
- low = colors["nfl_blue"], mid = "white", high = colors["nfl_red"], midpoint = 0.5) +
- scale_shape_manual(name = "Playoffs:",
- breaks = c("Reached Playoffs", "Won Superbowl"), # *3
- values = c("No Playoffs" = 32, "Reached Playoffs" = 6, "Won Superbowl" = 25)) + # *2
- guides(fill = guide_colourbar(order = 1, title.vjust = 0.8, direction = "horizontal"), # *4
- shape = guide_legend(order = 3, direction = "vertical", title.position = "left")) +
- theme(legend.position = "bottom",
- legend.box.just = c(0.5, 0.5),
- legend.box.spacing = unit(0, "cm"),
- legend.spacing = unit(2, "cm"),
- legend.background = element_rect(fill = colors["nfl_grey"]),
- legend.key = element_rect(fill = colors["nfl_grey"]),
- strip.background = element_rect(fill = colors["nfl_blue"]),
- strip.text = element_text(color = colors["nfl_lightgrey"]),
- plot.background = element_rect(fill = colors["nfl_grey"]),
- plot.caption = element_text(hjust = 0))
- g
- #ggsave(file = str_c(getwd(), "/nfl.png"), plot = g, width = 12, height = 15) # uncomment to save to disk in current working directory
- ###################################################################################################
Add Comment
Please, Sign In to add comment