- Downloading Live Olympic Medal Data into R
- # Scrape the London 2012 medal table with XML::readHTMLTable() and plot the
- # total medals per country.
- library(XML)
- library(RCurl)
- theurl <- "http://www.london2012.com/medals/medal-count/"
- # First attempt: a plain getURL() request.
- page <- getURL(theurl)
- page # fail
- ## [1] "<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</HEAD><BODY>\n<H1>Access Denied</H1>\n \nYou don't have permission to access \"http://www.london2012.com/medals/medal-count/\" on this server.<P>\nReference #18.358a503f.1343590091.c056ae2\n</BODY>\n</HTML>\n"
- # NOTE(review): the original also ran `page <- readHTMLTable(theurl)` here; its
- # result was overwritten by the next statement, so the call is left commented out.
- # page <- readHTMLTable(theurl)
- # Retry with a browser User-Agent string supplied in the request.
- page <- getURLContent(
-   theurl,
-   useragent = "Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2"
- )
- # Parse the HTML tables in the page; header = FALSE because the column names
- # are assigned by hand below.
- rr <- readHTMLTable(page, header = FALSE)
- # Take the first table and name its seven columns.
- rr2 <- setNames(
-   rr[[1]],
-   c("rank", "country", "gold", "silver", "bronze", "junk", "total")
- )
- rr3 <- subset(rr2, select = -junk)
- ## oops, numbers all got turned into factors ...
- # Go through as.character() so the factor labels, not the internal integer
- # codes, are what gets converted to numbers.
- tmpf <- function(x) {
-   as.numeric(as.character(x))
- }
- # Convert every column except country (column 2); lapply() returns one
- # converted vector per column regardless of how many rows there are.
- rr3[-2] <- lapply(rr3[-2], tmpf)
- head(rr3)
- ## rank country gold silver bronze total
- ## 1 1 People's Republic of China 6 4 2 12
- ## 2 2 United States of America 3 5 3 11
- ## 3 3 Italy 2 3 2 7
- ## 4 4 Republic of Korea 2 1 2 5
- ## 5 5 France 2 1 1 4
- ## 6 6 Democratic People's Republic of Korea 2 0 1 3
- with(rr3, dotchart(total, country))
- # Alternative approach: pull the medal table out of the page text with regular
- # expressions only (RCurl for the download, no HTML table parsing).
- # To work from a saved copy of the page instead of the live site:
- # file <- "~/Documents/R/medals.html"
- # page <- readChar(file,file.info(file)$size)
- library(RCurl)
- theurl <- "http://www.london2012.com/medals/medal-count/"
- # A browser User-Agent string is supplied with the request.
- page <- getURLContent(
-   theurl,
-   useragent = "Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2"
- )
- # Remove html tags (non-greedy, so each tag is stripped on its own):
- page <- gsub("<(.|\n)*?>", "", page)
- # Remove newlines; carriage returns are kept because the row pattern below
- # uses them as delimiters:
- page <- gsub("\n", "", page)
- # Match table: the text between "Total" and "Detailed".
- page <- regmatches(page, regexpr("(?<=Total).*(?=Detailed)", page, perl = TRUE))
- # Fail with a clear message instead of an obscure subscript error further down.
- if (length(page) != 1L) {
-   stop("medal table not found in the downloaded page", call. = FALSE)
- }
- # Extract country+medals+rank: each match starts at a digit and extends to the
- # last digit before the next carriage return.
- codes <- regmatches(page, gregexpr("\\d+[^\\r]*\\d+", page, perl = TRUE))[[1]]
- # Keep the odd-numbered matches, stopping two short of the end.
- codes <- codes[seq(1, length(codes) - 2, by = 2)]
- # Extract country and medals:
- Names <- gsub("\\d", "", codes)
- # NOTE(review): "\\d" matches one digit at a time, so the last three *digits*
- # of each entry are taken as gold/silver/bronze. Presumably this is only right
- # while every count is below 10 -- confirm against the page layout before
- # relying on it for larger counts.
- Medals <- vapply(
-   regmatches(codes, gregexpr("\\d", codes)),
-   function(x) x[(length(x) - 2):length(x)],
-   character(3L)
- )
- # Create data frame (Medals is a 3-row matrix: one column per country):
- data.frame(
-   Country = Names,
-   Gold = as.numeric(Medals[1, ]),
-   Silver = as.numeric(Medals[2, ]),
-   Bronze = as.numeric(Medals[3, ])
- )
- ## Country Gold Silver Bronze
- ## 1 People's Republic of China 6 4 2
- ## 2 United States of America 3 5 3
- ## 3 Italy 2 3 2
- ## 4 Republic of Korea 2 1 2
- ## 5 France 2 1 1
- ## 6 Democratic People's Republic of Korea 2 0 1
- ## 7 Kazakhstan 2 0 0
- ## 8 Australia 1 1 1
- ## 9 Brazil 1 1 1
- ## 10 Hungary 1 1 1
- ## 11 Netherlands 1 1 0
- ## 12 Russian Federation 1 0 3
- ## 13 Georgia 1 0 0
- ## 14 South Africa 1 0 0
- ## 15 Japan 0 2 3
- ## 16 Great Britain 0 1 1
- ## 17 Colombia 0 1 0
- ## 18 Cuba 0 1 0
- ## 19 Poland 0 1 0
- ## 20 Romania 0 1 0
- ## 21 Taipei (Chinese Taipei) 0 1 0
- ## 22 Azerbaijan 0 0 1
- ## 23 Belgium 0 0 1
- ## 24 Canada 0 0 1
- ## 25 Republic of Moldova 0 0 1
- ## 26 Norway 0 0 1
- ## 27 Serbia 0 0 1
- ## 28 Slovakia 0 0 1
- ## 29 Ukraine 0 0 1
- ## 30 Uzbekistan 0 0 1