Advertisement
Guest User

Untitled

a guest
May 25th, 2018
246
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Ruby 3.16 KB | None | 0 0
  # Rake entry point: `rake sportschools:parse`.
  # Depends on :environment, then hands the whole job over to
  # WebScraping::SportschoolsService#call.
  namespace :sportschools do
    desc "Parse schools from sportschools.ru"
    task :parse => :environment do
      scraper = WebScraping::SportschoolsService.new
      scraper.call
    end
  end
  7.  
  require "fileutils"
  require "open-uri"

  module WebScraping
    # Walks the city list published on the site's home page, then every listing
    # page of every city, then every school page linked from those listings, and
    # writes one xlsx workbook per city (one row per school).
    #
    # The crawl is best-effort: a city, listing page or school that fails is
    # reported on stdout and skipped; it never aborts the whole run.
    class SportschoolsService
      # Column titles of the generated sheet. The first seven line up, in order,
      # with the array returned by #school_data; the last one holds the school URL.
      HEADERS = %w(Название Категории Сайт Описание Адрес E-mail Телефон Url).freeze

      # Directory (relative to the working directory) that receives the workbooks.
      OUTPUT_DIR = "sportschools"

      # Selector for the city links on the home page.
      CITY_LINK_SELECTOR = "div#cl_city.combo_list a.combo_list_item"

      attr_reader :base_url

      def initialize
        @base_url = "https://sportschools.ru/"
      end

      # Runs the full scrape and writes "sportschools/<city>.xlsx" for each city.
      #
      # @return [void]
      def call
        puts "City count: #{city_count}"

        cities do |city, pages_count|
          puts "Create xlsx doc sportshools/#{city}"
          book = create_xlsx(city)

          begin
            puts "Parse schools for a #{city}, pages_count: #{pages_count}"

            # Row 0 holds the headers, so school rows start at 1.
            row = 1
            pages(city, pages_count) do |page, page_num|
              schools(page) do |school, school_url|
                write_xlsx(book, school_data(school), school_url, row)

                row += 1
              end

              print "Page is written: #{page_num}\r"
            end
          ensure
            # Close even when the city fails half-way, so the rows collected so
            # far are flushed instead of being lost with an unclosed workbook.
            book.close
          end

          puts "Finished for #{city}"
          puts "-------------------------------------------"
        end
      end

      private

      # Downloads and parses a page of the site.
      #
      # Uses URI.open (open-uri) rather than Kernel#open: Kernel#open no longer
      # opens URLs on Ruby 3.0+, and the block form closes the handle once the
      # document has been parsed.
      #
      # @param url [String] path relative to #base_url ("" for the home page)
      # @return [Nokogiri::HTML::Document]
      def open_page(url = "")
        URI.open(base_url + url) { |io| Nokogiri::HTML(io) }
      end

      # City links from the home page, fetched once and shared by #cities and
      # #city_count. The first matched link is dropped, as in the original code
      # (presumably a non-city "all cities" entry - TODO confirm against the site).
      # Falls back to an empty list when nothing matches.
      def city_links
        @city_links ||= (open_page.css(CITY_LINK_SELECTOR)[1..-1] || [])
      end

      # Yields every city together with its number of listing pages.
      #
      # @yieldparam city [String] the link's href minus its first 6 characters
      #   (presumably a "?city=" prefix - verify against the site markup)
      # @yieldparam pages_count [Integer]
      def cities
        city_links.each do |city_link|
          href = city_link["href"]

          begin
            yield href[6..-1], sportschool_pages_count(open_page(href))
          rescue StandardError => e
            # Best-effort: report and move on to the next city.
            puts "Skipped city #{href.inspect}: #{e.class}: #{e.message}"
          end
        end
      end

      # @return [Integer] number of cities that #cities will iterate over
      def city_count
        city_links.count
      end

      # Yields the listing rows of each page of a city.
      #
      # @param city [String] city identifier as yielded by #cities
      # @param pages_count [Integer] last page number to request (pages are 1-based)
      # @yieldparam rows [Nokogiri::XML::NodeSet] the "div.listings tr" nodes
      # @yieldparam page [Integer] current page number
      def pages(city, pages_count)
        1.upto(pages_count) do |page|
          begin
            yield open_page("index_filtered.php?city=#{city}&page=#{page}").css("div.listings tr"), page
          rescue StandardError
            puts "Skipped broken #{page} for a #{city}"
          end
        end
      end

      # Yields the parsed detail page of every school found in a listing.
      # Rows that have no "a.title" link are skipped.
      #
      # @param page [Nokogiri::XML::NodeSet] rows as yielded by #pages
      # @yieldparam school [Nokogiri::HTML::Document] the school's own page
      # @yieldparam school_url [String] absolute URL of that page
      def schools(page)
        page.css("tr").each do |row|
          link = row.at_css("a.title")
          next unless link

          href = link["href"]

          begin
            yield open_page(href), base_url + href
          rescue StandardError => e
            # Best-effort: report and move on to the next school.
            puts "Skipped school #{href.inspect}: #{e.class}: #{e.message}"
          end
        end
      end

      # Reads the page count from the trailing digits of the first
      # "div.navigation span" of a city page.
      #
      # NOTE(review): raises NoMethodError when the page has no such span, which
      # makes #cities skip that city - confirm that is what is wanted for cities
      # that have a single page of results.
      #
      # @param city_page [Nokogiri::HTML::Document]
      # @return [Integer] 0 when the span text does not end in digits
      def sportschool_pages_count(city_page)
        city_page.css("div.navigation span")[0].text[/\d+$/].to_i
      end

      # Extracts the exported fields from a school page.
      #
      # @param page [Nokogiri::HTML::Document]
      # @return [Array<String>] seven values, in the order of the first seven HEADERS
      def school_data(page)
        [
          page.css("table.adr h1.fix.fn.org").text,
          page.css("table.adr a.cat-url").map(&:text).join(', '),
          page.css("table.adr td.url a.value").text.strip,
          page.css("table.adr td.note").text,
          page.css("table.adr span.street-address").text.strip,
          page.css("table.adr span.email").text.strip,
          page.css("table.adr b.tel").text.strip
        ]
      end

      # Creates "sportschools/<city>.xlsx" with the header row already written.
      # The output directory is created when missing.
      #
      # @param city [String]
      # @return [WriteXLSX] the open workbook; the caller must close it
      def create_xlsx(city)
        FileUtils.mkdir_p(OUTPUT_DIR)

        book = WriteXLSX.new(File.join(OUTPUT_DIR, "#{city}.xlsx"))
        sheet = book.add_worksheet
        HEADERS.each_with_index { |header, col| sheet.write(0, col, header) }

        book
      end

      # Writes one school into a single row of the first sheet: each value goes
      # into its own column and the URL into the last header column.
      #
      # @param book [WriteXLSX] workbook returned by #create_xlsx
      # @param school [Array<String>] values as returned by #school_data
      # @param school_url [String]
      # @param disposition [Integer] zero-based row index to write to
      def write_xlsx(book, school, school_url, disposition)
        sheet = book.sheets[0]

        school.each_with_index { |value, col| sheet.write(disposition, col, value) }
        sheet.write(disposition, HEADERS.size - 1, school_url)
      end
    end
  end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement