Guest User

Scraping

a guest
Aug 28th, 2020
111
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 1.30 KB | None | 0 0
  1. require 'selenium-webdriver'
  2. require 'nokogiri'
  3. require 'pry'
  4. require "open-uri"
  5.  
  6. options = Selenium::WebDriver::Chrome::Options.new(args: ['headless'])
  7.  
  8. driver = Selenium::WebDriver.for(:chrome, options: options)
  9.  
  10. driver.get('https://www.instagram.com/instagram/')
  11. sleep 5
  12.  
  13. driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
  14. sleep 1
  15. driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
  16. sleep 1
  17. driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
  18. sleep 2
  19.  
  20. doc = Nokogiri::HTML(driver.page_source)
  21.  
  22. links = doc.search('a').map(&:values)
  23.  
  24. #cleaning
  25. s = []
  26. clean = []
  27.  links.each do |ele|
  28.   ele.each do |ele2|
  29.     if ele2.include?('/p')
  30.       s << ele2
  31.     end
  32.   end
  33.  end
  34.  
  35. s.each do |string|
  36.   string.scan /\/p\/.+/ do |i|
  37.     clean << i
  38.   end
  39. end
  40.  
  41. #links to each pic
  42. final_links = clean.map{|ele| "https://www.instagram.com#{ele}"}
  43.  
  44. #finding direct link to each pic
  45. arr = doc.search('img').map(&:values)
  46. arr.length
  47.  
  48. #cleaning
  49. images_hd = []
  50. arr.each do |ele|
  51.   ele.each do |link|
  52.     if link.include?("cover")
  53.       images_hd << ele[-2]
  54.     end
  55.   end
  56. end
  57.  
  58. #downloading the pics
  59.  images_hd.each_with_index do |pic,i|
  60.     File.open("imagen #{i}.png", 'wb') do |fo|
  61.       fo.write open(pic).read
  62.     end
  63. end
  64.  
  65. driver.quit
Add Comment
Please, Sign In to add comment