Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#' Scrape one team statistics table from a Basketball-Reference season page
#'
#' Downloads the NBA league page for the season ending in `season_end`,
#' extracts the HTML table whose id is `table_name`, promotes the row whose
#' first cell is "Rk" to the header, converts every column except `team` and
#' `arena` to numeric, and prepends the season label, table label and the
#' team ids parsed from the links inside the table.
#'
#' Side effect: attaches the `rvest`, `dplyr` and `pipeR` packages.
#'
#' @param season_end Year in which the season ends. Used to build the page
#'   URL and the `season` label (`"<season_end - 1>-<season_end>"`).
#' @param table_name Id of the HTML table to extract. Used verbatim in the
#'   CSS selector; its lower-cased form plus `"_stats"` fills the
#'   `table_name` column of the result.
#' @param date If `TRUE`, add a `scrape_time` column holding `Sys.time()`.
#' @return A data frame with the columns `season`, `table_name`,
#'   `bref_team_id`, the cleaned table columns, `playoff_team`, and, when
#'   `date` is `TRUE`, `scrape_time`.
getBREFTeamStatTable <- function(season_end = 2015, table_name = 'team', date = TRUE) {
  # Attach the dependencies on every call; the pipe operator `%>>%` used
  # below comes from pipeR.
  packages <- c("rvest", "dplyr", "pipeR")
  lapply(packages, library, character.only = TRUE)

  base <- "http://www.basketball-reference.com/leagues/"
  league <- "NBA"

  # The selectors are built from the caller's spelling of `table_name`,
  # before it is lower-cased for the output label.
  css_page <- paste0("#", table_name)
  css_id <- paste0(css_page, " , ", css_page, " a")

  table_name <- tolower(table_name)
  stat_table <- paste(table_name, "stats", sep = "_")
  url <- paste0(base, league, "_", season_end, ".html")

  # Download and parse the page once; it is reused for the team links below.
  page <- read_html(url)

  df <- page %>>%
    html_nodes(css_page) %>>%
    html_table(header = FALSE) %>>%
    data.frame() %>>%
    tbl_df()

  # Locate the header row: the row whose first cell is "Rk".
  # NOTE(review): the first column is expected to be named `X.1` after the
  # data.frame() conversion above -- confirm against the installed rvest.
  header_rows <- if ("X.1" %in% names(df)) which(df$X.1 == "Rk") else integer(0)
  if (length(header_rows) == 0) {
    stop(
      "Could not find the 'Rk' header row in table '", css_page,
      "' at ", url,
      call. = FALSE
    )
  }
  header_row <- header_rows[1]
  header <- tolower(as.character(df[header_row, ]))

  # Drop everything up to and including the header row, then name the columns.
  df <- df[-seq_len(header_row), ]
  names(df) <- header
  names(df) <- gsub("\\%|/", "\\.", names(df))
  df$rk <- NULL

  # Strip thousands separators and convert to numeric, leaving the text
  # columns untouched.
  text_cols <- c("team", "arena")
  is_numeric_col <- !(names(df) %in% text_cols)
  df[is_numeric_col] <- lapply(
    df[is_numeric_col],
    function(x) as.numeric(gsub("\\,", "", x))
  )

  # An asterisk after the team name is recorded as `playoff_team`, then removed.
  df$playoff_team <- grepl("\\*", df$team)
  df$team <- gsub("\\*", "", df$team)

  # Drop the last row of the table.
  # NOTE(review): presumably a league-average summary row -- confirm.
  df <- utils::head(df, -1)

  season <- paste0(season_end - 1, "-", season_end)

  # Collect the team ids from the attributes of the table and its links.
  stems <- page %>>%
    html_nodes(css_id) %>>%
    html_attrs() %>>%
    unlist() %>>%
    as.character()
  # Skip the first two entries (the original comment says they are headers).
  # NOTE(review): presumably the table element's own attributes -- confirm.
  stems <- stems[-(1:2)]

  # Remove the path pieces and the year, leaving only the team id.
  bref_team_id <- gsub("/|\\.html|teams", "", stems)
  bref_team_id <- gsub(as.character(season_end), "", bref_team_id, fixed = TRUE)

  df <- data.frame(season, table_name = stat_table, bref_team_id, df)

  if (date == TRUE) {
    df$scrape_time <- Sys.time() # add scrape time if you want it
  }
  df
}
- # This paste is code from a tutorial by Alex Bresler
- # http://asbcllc.com/blog/2014/november/creating_bref_scraper/
- # Comments translated into Russian for
- # http://www.datadrivenjournalism.ru/2015/03/webscrape-in-r/
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement