Advertisement
jorandradefig

scrape.R

Mar 9th, 2019
255
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.81 KB | None | 0 0
  ########################
  # Web Scraping
  # @jorandradefig
  #
  # Scrapes an IMDb search-results page (feature films released in 2016),
  # builds a data frame with one row per movie (Rank, Title, Gross) and
  # plots it with ggplot2.
  ########################

  # Install a package only when it is missing, rather than on every run.
  for (pkg in c("rvest", "ggplot2")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
  }

  # Load packages
  library("rvest")
  library(ggplot2)

  # Helpers ----

  # Convert a rank label to a number by keeping only its digits.
  # Labels are presumably of the form "1." -- confirm against the page.
  parse_rank <- function(x) {
    as.numeric(gsub("[^0-9]", "", x))
  }

  # Convert a gross label to a number by dropping everything except digits
  # and the decimal point. Labels are presumably of the form "$532.18M"
  # (i.e. millions of dollars) -- confirm against the page.
  # NA and empty labels come back as NA.
  parse_gross <- function(x) {
    as.numeric(gsub("[^0-9.]", "", x))
  }

  # Download ----

  # Store the URL
  url <- 'http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature'

  # Fetch and parse the HTML behind the URL
  webpage <- read_html(url)

  # One node per movie. Every field below is looked up *inside* its own
  # movie node so that a movie with a missing field yields NA instead of
  # shifting all later values out of alignment.
  # NOTE(review): assumes each result is wrapped in an element with class
  # "lister-item" (the ancestor of ".lister-item-header") -- verify against
  # the current page markup.
  items <- html_nodes(webpage, ".lister-item")

  if (length(items) == 0) {
    stop(
      "No '.lister-item' nodes found; the page layout may have changed.",
      call. = FALSE
    )
  }

  # Ranking ----

  # First node with class text-primary in each movie, as text, then numeric
  rank_data <- parse_rank(html_text(html_node(items, ".text-primary")))

  # Print the first values
  head(rank_data)

  # Titles ----

  title_data <- html_text(html_node(items, ".lister-item-header a"))

  head(title_data)

  ########################################
  # Ratings (placeholder section: nothing is extracted here)
  ########################################

  # Gross ----

  gross_data_html <- html_node(items, ".ghost~ .text-muted+ span")

  gross_data <- parse_gross(html_text(gross_data_html))

  head(gross_data)

  # Assemble ----

  movies <- data.frame(
    Rank = rank_data,
    Title = title_data,
    Gross = gross_data,
    stringsAsFactors = FALSE
  )

  # Keep only movies that actually report a gross figure, wherever they
  # appear in the listing.
  movies <- movies[!is.na(movies$Gross), ]

  nrow(movies)

  movies

  # Plots ----

  # Histogram of the ranks that have a gross figure
  ggplot(movies, aes(Rank)) +
    geom_histogram(binwidth = 5, fill = "blue")

  # Gross against rank, each point labelled with the movie title.
  # The colour is a constant, so it is set on the geoms (no legend needed).
  ggplot(movies, aes(Rank, Gross, label = Title)) +
    geom_point(colour = "blue") +
    geom_text(colour = "blue")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement