Advertisement
Guest User

teengallery.com image scraper

a guest
Dec 1st, 2017
5,397
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 3.80 KB | None | 0 0
  1. require 'excon'
  2. require 'fileutils'
  3. require 'nokogiri'
  4. require 'open-uri'
  5. require 'rmagick'
  6. require 'ruby-progressbar'
  7.  
  8. @working_directory = Dir.pwd
  9. @headers = {
  10.   'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  11.   'Accept-Language' => 'en-US,en;q=0.5',
  12.   'Connection' => 'keep-alive',
  13.   'DNT' => 1,
  14.   'Host' => 'teengallery.com',
  15.   'Upgrade-Insecure-Requests' => 1,
  16.   'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'
  17. }
  18.  
  19. def url_encode(str)
  20.   return URI::encode(str).gsub("[","%5B").gsub("]","%5D")
  21. end
  22.  
  23. # Perform an HTTP GET and parse the response
  24. def fetch_html(uri)
  25.   connection = Excon.new(url_encode(uri))
  26.   response = connection.request(method:'GET', idempotent:true, retry_limit:5, headers:@headers)
  27.   return Nokogiri::HTML(response.body)
  28. end
  29.  
  30. # Create symlinks from image to associated image-list directories
  31. def create_image_lists(html)
  32.   image_num = html.css('div.maincolumn--title h1').text().split('#').last.to_i()
  33.   image_filename = "%07d.jpg" % image_num
  34.   filepath = "#{@working_directory}/images/#{image_filename}"
  35.   html.css('span.sidelist--name').each do |list|
  36.     list_directory = "lists/#{list.text().strip()}"
  37.     FileUtils::mkdir_p(list_directory)
  38.     FileUtils::ln_sf(filepath, "#{list_directory}/#{image_filename}")
  39.   end
  40. end
  41.  
  42. # teengallery splits fullsize images up into vertical slices. This method stitches them together
  43. def save_fullsize_image(image_name, filepath)
  44.   # Retrieve image slice names
  45.   html = fetch_html("https://www.teengallery.com/fullsize.php?f=#{image_name}")
  46.   image_slices = html.css('img').collect {|img| img['src'].split('/').last()}
  47.  
  48.   # Fetch image slices
  49.   threads = []
  50.   image_slices.each do |slice_name|
  51.     threads.push(Thread.new {
  52.       url = "https://www.teengallery.com/fullslices/#{url_encode(slice_name)}"
  53.       connection = Excon.new(url)
  54.       response = connection.request(method:'GET', idempotent:true, retry_limit:5, headers:@headers)
  55.       File.open("tmp/#{slice_name}", 'w') {|f| f.write(response.body)}
  56.     })
  57.   end
  58.   threads.each(&:join)
  59.  
  60.   # Stitch together image slices and save to file
  61.   image_list = Magick::ImageList.new()
  62.   image_slices.each {|slice_name| image_list.read('tmp/' + slice_name) }
  63.   image_list.append(false).write(filepath)
  64.  
  65.   # Cleanup tmp files
  66.   FileUtils.rm_f(image_slices.map {|slice_name| 'tmp/' + slice_name})
  67. end
  68.  
  69. # Setup our working directory
  70. FileUtils.mkdir_p('images')
  71. FileUtils.mkdir_p('tmp')
  72.  
  73. # Determine what the latest image is so we know when to stop
  74. html = fetch_html('https://www.teengallery.com')
  75. image_limit = html.css('div.maincolumn--title h1').text().split('#').last.to_i()
  76.  
  77. progressbar = ProgressBar.create(:title => 'Downloading Images',
  78.                                  :total => image_limit,
  79.                                  :format => "%f %b\u{15E7}%i %p%% %t",
  80.                                  :progress_mark => ' ',
  81.                                  :remainder_mark => "\u{FF65}"
  82.                                 )
  83.  
  84. image_number = 1
  85. num_threads = 16
  86. while image_number <= image_limit
  87.   threads = []
  88.   (image_number..image_number+num_threads-1).each do |num|
  89.     next if num > image_limit
  90.     progressbar.increment()
  91.     threads.push(Thread.new {
  92.       # Retrieve and page contents and extract image data
  93.       html = fetch_html("https://www.teengallery.com/index.php?id=#{num}")
  94.       img = html.at_css('div#mainphoto img')
  95.       unless img.nil? || img['src'] == '/resized/'
  96.         img_name = img['src'].split('/').last()
  97.         image_filename = "%07d.jpg" % num
  98.         filepath = "#{@working_directory}/images/#{image_filename}"
  99.         save_fullsize_image(img_name, filepath)
  100.         create_image_lists(html)
  101.       end
  102.     })
  103.   end
  104.   threads.each(&:join)
  105.   image_number += num_threads
  106. end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement