Advertisement
Guest User

Untitled

a guest
Nov 30th, 2020
37
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.54 KB | None | 0 0
  1. import scrapy
  2.  
  3.  
  4. class Mingkh(scrapy.Spider):
  5.     name = 'houses'
  6.        
  7.     start_urls = ['https://dom.mingkh.ru/kemerovskaya-oblast/novokuzneck/houses?page=1']
  8.    
  9.     def parse(self, response):
  10.         house_page_links = response.css('td a::attr(href)')
  11.         yield from response.follow_all(house_page_links, self.parse_house)
  12.  
  13.         pagination_links = response.css('li a[rel="next"]::attr(href)')
  14.         yield from response.follow_all(pagination_links, self.parse)
  15.  
  16.     def parse_house(self, response):
  17.         def extract_with_css(query):
  18.             return response.css(query).get(default='').strip()
  19.  
  20.         company_urls = response.css('dd span.clickable::attr(data-url)')
  21.         yield from response.follow_all(company_urls, self.parse_company)
  22.  
  23.         yield {
  24.             'address': extract_with_css('div.block-heading-two h1::text'),
  25.             'url': response.url,
  26.  
  27.         }
  28.  
  29.     def parse_company(self, response):
  30.         def extract_with_css(query):
  31.             return response.css(query).get(default='').strip()
  32.  
  33.         def extract_with_xpath(query):
  34.             return response.xpath(query).get(default='').strip()
  35.  
  36.         yield {
  37.             'company_url': response.url,
  38.             'company_name': extract_with_css('div.block-heading-two h1::text'),
  39.             'company_phone': extract_with_xpath('/html[1]/body[1]/div[1]/div[2]/div[2]/div[3]/div[1]/dl[1]/dd[4]/text()'),
  40.             'company_address': extract_with_xpath('/html[1]/body[1]/div[1]/div[2]/div[2]/div[3]/div[1]/dl[1]/dd[3]/text()'),
  41.  
  42.         }
  43.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement