Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Rake tasks for the sportschools.ru scraper.
namespace :sportschools do
  desc "Parse schools from sportschools.ru"
  # Depends on :environment so the application (and with it the
  # WebScraping::SportschoolsService constant) is loaded before the task runs.
  task parse: :environment do
    scraper = WebScraping::SportschoolsService.new
    scraper.call
  end
end
require "fileutils"
require "open-uri"

module WebScraping
  # Scrapes the school catalogue of sportschools.ru and writes one XLSX
  # workbook per city into the `sportschools/` directory.
  #
  # Each workbook has a header row followed by one row per school. The
  # scraper is best-effort: a city, listing page or school that fails to load
  # or parse is reported on stderr and skipped, and the run continues.
  #
  # Relies on Nokogiri and the write_xlsx gem being loaded by the host
  # application.
  class SportschoolsService
    DEFAULT_BASE_URL = "https://sportschools.ru/"
    OUTPUT_DIR = "sportschools"
    CITY_LINK_SELECTOR = "div#cl_city.combo_list a.combo_list_item"
    # Column titles of the header row; the order matches #school_data, with
    # the school page URL appended as the last column.
    HEADERS = %w[Название Категории Сайт Описание Адрес E-mail Телефон Url].freeze
    URL_COLUMN = HEADERS.size - 1

    attr_reader :base_url

    # @param base_url [String] site root that every relative URL is appended to
    def initialize(base_url: DEFAULT_BASE_URL)
      @base_url = base_url
    end

    # Runs the whole scrape, printing progress to stdout.
    #
    # @return [void]
    def call
      puts "City count: #{city_count}"
      cities do |city, pages_count|
        puts "Create xlsx doc #{OUTPUT_DIR}/#{city}"
        book = create_xlsx(city)
        begin
          puts "Parse schools for a #{city}, pages_count: #{pages_count}"
          row = 1 # row 0 holds the headers
          pages(city, pages_count) do |page, page_num|
            schools(page) do |school, school_url|
              write_xlsx(book, school_data(school), school_url, row)
              row += 1
            end
            print "Page is written: #{page_num}\r"
          end
        ensure
          # Always finalise the workbook so already-collected rows reach disk.
          book.close
        end
        puts "Finished for #{city}"
        puts "-------------------------------------------"
      end
    end

    private

    # Downloads and parses a page.
    #
    # @param url [String] path/query relative to #base_url
    # @return [Nokogiri::HTML::Document]
    def open_page(url = "")
      # Block form closes the connection/tempfile once parsing is done.
      URI.open(base_url + url) { |io| Nokogiri::HTML(io) }
    end

    # City links from the index page, fetched once and memoised.
    # The first entry of the list is dropped (presumably a non-city
    # placeholder such as "all cities" - confirm against the site).
    #
    # @return [Array<Nokogiri::XML::Element>]
    def city_links
      @city_links ||= open_page.css(CITY_LINK_SELECTOR).drop(1)
    end

    # Yields every city together with its number of listing pages.
    # Any error raised while loading or processing a city is reported and
    # the city is skipped.
    #
    # @yieldparam city [String] city identifier taken from the link href
    # @yieldparam pages_count [Integer]
    def cities
      city_links.each do |city_link|
        href = city_link["href"]
        # Drops the first six characters of the href (presumably the
        # `?city=` prefix - verify against the site markup).
        yield(href[6..-1], sportschool_pages_count(open_page(href)))
      rescue StandardError => e
        warn "Skipped city #{href.inspect}: #{e.class}: #{e.message}"
      end
    end

    # @return [Integer] number of cities that will be visited
    def city_count
      city_links.size
    end

    # Yields the listing rows of every result page of a city.
    #
    # @param city [String]
    # @param pages_count [Integer]
    # @yieldparam rows [Nokogiri::XML::NodeSet] `tr` nodes of the listing
    # @yieldparam page [Integer] 1-based page number
    def pages(city, pages_count)
      1.upto(pages_count) do |page|
        yield(open_page("index_filtered.php?city=#{city}&page=#{page}").css("div.listings tr"), page)
      rescue StandardError => e
        warn "Skipped broken #{page} for a #{city}: #{e.class}: #{e.message}"
      end
    end

    # Yields the parsed detail page and absolute URL of every school that is
    # linked from the given listing rows.
    #
    # @param page [Nokogiri::XML::NodeSet] listing rows
    # @yieldparam school [Nokogiri::HTML::Document]
    # @yieldparam school_url [String]
    def schools(page)
      page.css("tr").each do |school|
        link = school.at_css("a.title")
        next unless link # row without a school link - nothing to scrape

        href = link["href"]
        yield(open_page(href), base_url + href)
      rescue StandardError => e
        warn "Skipped school #{href.inspect}: #{e.class}: #{e.message}"
      end
    end

    # Reads the page count from the trailing number of the first pagination
    # label.
    #
    # @param city_page [Nokogiri::HTML::Document]
    # @return [Integer] 0 when the label does not end in digits
    # @raise [RuntimeError] when the page has no pagination label
    def sportschool_pages_count(city_page)
      label = city_page.css("div.navigation span").first
      raise "pagination label not found" unless label

      label.text[/\d+$/].to_i
    end

    # Extracts the school attributes, in HEADERS order (without the URL).
    #
    # @param page [Nokogiri::HTML::Document] school detail page
    # @return [Array<String>]
    def school_data(page)
      [
        page.css("table.adr h1.fix.fn.org").text,
        page.css("table.adr a.cat-url").map(&:text).join(', '),
        page.css("table.adr td.url a.value").text.strip,
        page.css("table.adr td.note").text,
        page.css("table.adr span.street-address").text.strip,
        page.css("table.adr span.email").text.strip,
        page.css("table.adr b.tel").text.strip
      ]
    end

    # Creates the workbook for a city and writes the header row.
    #
    # @param city [String]
    # @return [WriteXLSX] open workbook; the caller must close it
    def create_xlsx(city)
      FileUtils.mkdir_p(OUTPUT_DIR)
      book = WriteXLSX.new(File.join(OUTPUT_DIR, "#{city}.xlsx"))
      sheet = book.add_worksheet
      HEADERS.each_with_index { |header, col| sheet.write(0, col, header) }
      book
    end

    # Writes one school into a single row: attribute i goes to column i and
    # the school URL to the last header column.
    #
    # @param book [WriteXLSX]
    # @param school [Array<String>] values from #school_data
    # @param school_url [String]
    # @param disposition [Integer] zero-based target row
    def write_xlsx(book, school, school_url, disposition)
      sheet = book.sheets[0]
      school.each_with_index { |value, col| sheet.write(disposition, col, value) }
      sheet.write(disposition, URL_COLUMN, school_url)
    end
  end
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement