Guest User

www.ricardo.ch categories parser

a guest
Aug 15th, 2014
315
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 2.60 KB | None | 0 0
  1. require 'net/http'
  2. require 'rubygems'
  3. require 'bundler/setup'
  4. Bundler.require(:default)
  5.  
  # Maximum number of request attempts get_path makes before it re-raises the
  # last error instead of retrying again.
  GET_PATH_MAX_ATTEMPTS = 5

  # Headers sent with every request; caller-supplied headers override them.
  # NOTE(review): the Cookie value is a hard-coded session string; presumably
  # it expires and should come from configuration -- confirm.
  GET_PATH_DEFAULT_HEADERS = {
    'Referer' => 'https://www.ricardo.ch/verkaufen/verkaufsformular/schritt1?src=btn_header_verkaufen&SSL=ON',
    'X-Requested-With' => 'XMLHttpRequest',
    'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
    'Cookie' => '__RequestVerificationToken_L05TRg2=kkGp2TsaWGlv9rFR5GOux9viivKMcod0JkJLplLPHr45Ou3iC1gsMvIR4fn017imgkP9of0oeQwybPRS889FEFM0Yyg1; BIGipServer~Ricardo_Prod~pool_webserver_ch_de=1041261578.18975.0000; ASP.NET_SessionId=131lk21sli3bxbmi5vkpblhz;',
    'Accept' => '*/*'
  }.freeze

  # Performs exactly one GET for +path+ and returns [body, headers_hash].
  # Raises RuntimeError on a non-2xx response; transport and gzip decoding
  # errors propagate unchanged.
  def get_path_once(http, path, headers)
    sleep(1) # pause before every request
    puts "Loading #{path}\n"

    req = Net::HTTP::Get.new(path, GET_PATH_DEFAULT_HEADERS.merge(headers))
    res = http.request(req)
    unless res.is_a?(Net::HTTPSuccess)
      # A response object is not an exception, so raise a descriptive error
      # instead of the response itself.
      raise "Unexpected response #{res.code} #{res.message} for #{path}"
    end

    body = res.body
    if res['Content-Encoding'] == 'gzip'
      # wrap closes the reader once the block has finished.
      body = Zlib::GzipReader.wrap(StringIO.new(body)) { |gz| gz.read }
    end
    [body, res.to_hash]
  end

  # Fetches +path+ over +http+ and yields the response body and the response
  # headers (as a Hash) to the given block.
  #
  # A failed request (transport error, non-2xx status, broken gzip body) is
  # retried after a 10 second pause, up to GET_PATH_MAX_ATTEMPTS attempts in
  # total; after the final attempt the last error is re-raised. Errors raised
  # by the block itself are NOT retried -- they propagate to the caller, so a
  # failure while processing a page never triggers another download.
  #
  # http    - object responding to #request (e.g. a Net::HTTP instance).
  # path    - request path, including any query string.
  # headers - extra request headers; they override the defaults.
  #
  # Returns whatever the block returns.
  def get_path(http, path, headers = {})
    attempts = 0
    begin
      attempts += 1
      page, response_headers = get_path_once(http, path, headers)
    rescue StandardError => e
      raise if attempts >= GET_PATH_MAX_ATTEMPTS

      warn "Request for #{path} failed (#{e.class}: #{e.message}), retrying"
      sleep(10)
      retry
    end

    # Deliberately outside the retried section: see the method comment.
    yield(page, response_headers)
  end
  42.  
  # One HTTPS connection object to www.ricardo.ch, used for every request.
  https = Net::HTTP.new('www.ricardo.ch', 443)
  https.use_ssl = true

  # Category tree keyed by the category id read from each link's data-catid.
  categories = {}

  # Read the top-level categories from the first step of the selling form.
  get_path(https, '/verkaufen/verkaufsformular/schritt1') do |page|
    top_level_links = Nokogiri.HTML(page).css('#navCatLvl1 li a')
    top_level_links.each do |link|
      entry = {
        update_level: link['data-update-level'],
        level: link['data-level'],
        title: link['data-text'],
        final: link['data-final-category'] == 'True',
        subcategories: {}
      }
      categories[link['data-catid']] = entry
    end
  end
  60.  
  # Recursively fills in the :subcategories of the category stored at hash[id].
  #
  # Does nothing when the category is flagged :final. Otherwise it requests the
  # child-category fragment for the category, records every link found in it
  # under the child's data-catid, and descends into each child in turn.
  #
  # https - connection object handed through to get_path.
  # id    - key of the category inside +hash+.
  # hash  - Hash containing the category; its entry is mutated in place.
  def load_category(https, id, hash)
    cat = hash[id]
    return if cat[:final]

    children_path = "/verkaufen/verkaufsformular/getchildcategories/?id=#{id}&level=#{cat[:update_level]}"
    get_path(https, children_path) do |page|
      child_links = Nokogiri.HTML(page).css('.navCat ul li a')
      child_links.each do |link|
        child_id = link['data-catid']
        cat[:subcategories][child_id] = {
          update_level: link['data-update-level'],
          level: link['data-level'],
          title: link['data-text'],
          final: link['data-final-category'] == 'True',
          subcategories: {}
        }
        # Descend immediately so the tree is built depth-first.
        load_category(https, child_id, cat[:subcategories])
      end
    end
  end
  79.  
  # Expand every top-level category into its full subtree, then print the
  # complete tree as JSON on standard output.
  categories.each_key { |root_id| load_category(https, root_id, categories) }

  json = MultiJson.dump(categories)
  puts json
Add Comment
Please, Sign In to add comment