Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
########################
# Web Scraping
# @jorandradefig
########################

# The workspace is deliberately not cleared here: wiping the global
# environment from inside a script destroys the caller's objects and does not
# give a clean session anyway (loaded packages and options survive).

# Install rvest only when it is missing, so that re-running the script does
# not hit the network or reinstall the package every time.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}

# Load the package.
library("rvest")

# URL of the search page to scrape.
url <- "http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature"

# Download and parse the HTML behind the URL.
webpage <- read_html(url)
# Ranking ----
# Select the nodes that carry the "text-primary" class.
rank_data_html <- webpage %>% html_nodes(".text-primary")
# Pull the text out of those nodes.
rank_data <- rank_data_html %>% html_text()
# Show the first values (still character at this point).
head(rank_data)
# Coerce to numeric, then show the first values again.
rank_data <- rank_data %>% as.numeric()
head(rank_data)

# Titles ----
# Select the links inside the item headers and keep their text.
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data <- title_data_html %>% html_text()
head(title_data)
########################################
# Classifications (section left empty in the original script)
########################################

# Gross earnings ----
# Not every film lists a gross figure, so selecting the gross spans directly
# from the whole page yields a shorter vector with no way of knowing which
# films are missing. Instead, look the span up once per film container:
# html_node() returns exactly one result per input node, and html_text()
# turns a missing result into NA, keeping gross aligned with rank and title.
# NOTE(review): assumes each film sits in a ".lister-item" container --
# confirm against the page markup. The stopifnot() below fails loudly if not.
movie_items_html <- html_nodes(webpage, ".lister-item")
gross_data_html <- html_node(movie_items_html, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)
head(gross_data)

# Strip the "$" prefix, the "M" suffix and any thousands separators, then
# convert to numeric. A fixed-width substring() is avoided on purpose: it
# silently truncated longer figures (e.g. "$532.18" became "532.1").
gross_data <- as.numeric(gsub("[$M,]", "", gross_data))
head(gross_data)

# All three vectors must describe the same films in the same order. Ranks and
# titles are therefore neither trimmed nor reversed: doing so paired each
# title with another film's rank.
length(rank_data)
length(gross_data)
length(title_data)
stopifnot(
  length(rank_data) == length(gross_data),
  length(title_data) == length(gross_data)
)

# One row per film; Gross is NA where the page shows no figure.
movies <- data.frame(Rank = rank_data, Title = title_data, Gross = gross_data)
movies
# Plots ----
# Install ggplot2 only when it is missing, rather than on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Histogram of ranks in bins of width 5. Written with ggplot() +
# geom_histogram() because qplot() is deprecated in current ggplot2.
ggplot(movies, aes(x = Rank)) +
  geom_histogram(binwidth = 5, fill = "blue")

# Gross against rank, each point labelled with the film title. The colour is
# a constant, so it is set on the geoms instead of being mapped inside aes();
# no legend is produced, which makes the old guides(... = FALSE) call
# (a deprecated form) unnecessary.
ggplot(movies, aes(x = Rank, y = Gross, label = Title)) +
  geom_point(colour = "blue") +
  geom_text(colour = "blue")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement